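"""Enrichment steps for O*NET tasks: classify which tasks are estimable
(i.e. schedulable, atomic activities) and generate 'effective time' estimates
for them. Both steps cache their results as versioned parquet files."""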
import json
from pathlib import Path

import pandas as pd

from .logger import logger
from .utils import enrich

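# Time units the model may answer with; presumably mapped to concrete
# durations downstream (trimester and semester reading as 3 and 6 months).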
ALLOWED_UNITS = [
    "minute",
    "hour",
    "day",
    "week",
    "month",
    "trimester",
    "semester",
    "year",
]
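
# Bumping a version string below invalidates the corresponding parquet cache,
# since the version is embedded in the cache filename.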
ESTIMABLE_CLASSIFICATION_VERSION = "old_version"
TIME_ESTIMATES_GENERATION_VERSION = "old_version"


def classify_tasks_as_estimable(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
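    """Classify each unique task in ``df_to_process`` as ATOMIC (schedulable)
    or ONGOING-CONSTRAINT, returning a DataFrame with one row per task and an
    ``estimable`` flag (None when the call or parse failed). Results are
    cached under ``cache_dir``; pass ``bust=True`` to force regeneration.
    """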
    CACHE_PATH = cache_dir / f"task_estimability.{ESTIMABLE_CLASSIFICATION_VERSION}.parquet"
    if CACHE_PATH.exists() and not bust:
        logger.info(f"Loading cached task estimability from {CACHE_PATH}")
        return pd.read_parquet(CACHE_PATH)

    logger.info("Enriching tasks with estimability classification.")

    df_unique_tasks = df_to_process.drop_duplicates(subset=['task']).copy()

    logger.info(f"Found {len(df_unique_tasks)} unique remote tasks to classify.")

    if df_unique_tasks.empty:
        raise ValueError("No unique tasks to classify.")

    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=[
            [
                {"role": "system", "content": """
Classify the provided O*NET task into one of these categories:
- ATOMIC (schedulable): A single, clearly-bounded activity, typically lasting minutes, hours, or a few days.
- ONGOING-CONSTRAINT (background role/ethical rule): A continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”).
""".strip()},
                {"role": "user", "content": f"Task: {row.task}"},
            ]
            for row in df_unique_tasks.itertuples()
        ],
        schema={
            "name": "estimability_classification",
            "schema": {
                "type": "object",
                "properties": {"task_category": {"type": "string", "enum": ["ATOMIC", "ONGOING-CONSTRAINT"]}},
                "required": ["task_category"],
                "additionalProperties": False
            }
        },
        chunk_size=300,
    )
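
    # `enrich` is assumed to return one response per message list, in input
    # order, with None entries for calls that failed outright.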
    if not results or len(results) != len(df_unique_tasks):
        raise ValueError(f"Task estimability classification failed or returned mismatched number of results. "
                         f"Expected {len(df_unique_tasks)}, got {len(results) if results else 0}.")

    classifications = []
    for index, response in enumerate(results):
        task_label = df_unique_tasks.iloc[index]['task']
        task_category_flag = None

        if response is None:
            logger.warning(f"API call failed for task (enrich returned None): '{task_label}'")
        else:
            try:
                content_str = response.choices[0].message.content
                if not content_str:
                    raise ValueError("No content found in the response message")

                data = json.loads(content_str)

                if 'task_category' in data and isinstance(data['task_category'], str):
                    task_category_flag = data['task_category']
                else:
                    logger.warning(f"Invalid or missing 'task_category' payload for task '{task_label}'. Data: '{data}'")
            except (json.JSONDecodeError, AttributeError, KeyError, IndexError, ValueError) as e:
                logger.warning(f"Could not parse response for task '{task_label}'. Error: {e}. Response: {response}")

        classifications.append({
            'task': task_label,
            # None (rather than False) when classification failed, so the
            # success count logged below stays accurate.
            'estimable': (task_category_flag == 'ATOMIC') if task_category_flag is not None else None
        })

    classification_df = pd.DataFrame(classifications)

    logger.info(f"Finished classification. Got {classification_df['estimable'].notna().sum()} successful classifications out of {len(df_unique_tasks)} unique tasks.")

    logger.info(f"Saving task estimability classifications to {CACHE_PATH}")
    classification_df.to_parquet(CACHE_PATH)

    return classification_df


def generate_time_estimates_for_tasks(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
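    """Estimate lower/upper 'effective time' bounds for every row of
    ``df_to_process``, returning a DataFrame keyed by ``onetsoc_code`` and
    ``task_id`` with quantity/unit columns (None when the call or parse
    failed). Results are cached under ``cache_dir``; pass ``bust=True`` to
    force regeneration.
    """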
    CACHE_PATH = cache_dir / f"task_estimates.{TIME_ESTIMATES_GENERATION_VERSION}.parquet"
    if CACHE_PATH.exists() and not bust:
        logger.info(f"Loading cached task estimates from {CACHE_PATH}")
        return pd.read_parquet(CACHE_PATH)

    logger.info("Enriching tasks with time estimates.")

    if df_to_process.empty:
        raise ValueError("No tasks to process for estimates.")

    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=[
            [
                {
                    "role": "system",
                    "content": """
You are an expert assistant evaluating the time required for job tasks. Your goal is to estimate the 'effective time' range needed for a skilled human to complete the following job task **remotely**, without supervision.

'Effective time' is the active, focused work duration required to complete the task. Crucially, **exclude all waiting periods, delays, or time spent on other unrelated activities**. Think of it as the continuous, productive time investment needed if the worker could pause and resume instantly without cost.

Provide a lower and upper bound estimate for the 'effective time'. These bounds should capture the time within which approximately 80% of instances of performing this specific task are typically completed by a qualified individual.

Base your estimate on the provided task and the associated occupation and occupation description. Your estimate must be in one of the allowed units: minute, hour, day, week, month, trimester, semester, year.""".strip()
                },
                {
                    "role": "user",
                    "content": f"{row.task} done by {row.occupation_title} ({row.occupation_description})"
                }
            ]
            for row in df_to_process.itertuples()
        ],
        schema={
            "name": "estimate_time",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "lower_bound_estimate": {
                        "type": "object",
                        "properties": {
                            "quantity": {
                                "type": "number",
                                "description": "The numerical value for the lower bound of the estimate.",
                            },
                            "unit": {
                                "type": "string",
                                "enum": ALLOWED_UNITS,
                                "description": "The unit of time for the lower bound.",
                            },
                        },
                        "required": ["quantity", "unit"],
                        "additionalProperties": False,
                    },
                    "upper_bound_estimate": {
                        "type": "object",
                        "properties": {
                            "quantity": {
                                "type": "number",
                                "description": "The numerical value for the upper bound of the estimate.",
                            },
                            "unit": {
                                "type": "string",
                                "enum": ALLOWED_UNITS,
                                "description": "The unit of time for the upper bound.",
                            },
                        },
                        "required": ["quantity", "unit"],
                        "additionalProperties": False,
                    },
                },
                "required": ["lower_bound_estimate", "upper_bound_estimate"],
                "additionalProperties": False,
            },
        },
        chunk_size=200,
    )
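
    # With "strict": True the model output should already satisfy the JSON
    # schema, but the parsing below still guards against missing/failed calls.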
    if not results or len(results) != len(df_to_process):
        raise ValueError(f"API call for task estimates failed or returned mismatched number of results. "
                         f"Expected {len(df_to_process)}, got {len(results) if results else 0}.")

    estimates = []
    for index, response in enumerate(results):
        row = df_to_process.iloc[index]
        task_info = f"O*NET: {row.onetsoc_code}, Task ID: {row.task_id}"
        lb_qty, lb_unit, ub_qty, ub_unit = None, None, None, None

        if response is None:
            logger.warning(f"API call failed for task (enrich returned None): {task_info}")
        else:
            try:
                content_str = response.choices[0].message.content
                if not content_str:
                    raise ValueError("No content found in the response message")

                data = json.loads(content_str)

                lb_qty = data['lower_bound_estimate']['quantity']
                lb_unit = data['lower_bound_estimate']['unit']
                ub_qty = data['upper_bound_estimate']['quantity']
                ub_unit = data['upper_bound_estimate']['unit']
            except Exception as e:
                logger.warning(f"Could not parse valid estimate for task {task_info}. Error: {e}. Response: {response}")
                lb_qty, lb_unit, ub_qty, ub_unit = None, None, None, None  # Reset on failure

        estimates.append({
            'onetsoc_code': row.onetsoc_code,
            'task_id': row.task_id,
            'lb_estimate_qty': lb_qty,
            'lb_estimate_unit': lb_unit,
            'ub_estimate_qty': ub_qty,
            'ub_estimate_unit': ub_unit
        })

    estimates_df = pd.DataFrame(estimates)
    logger.info(f"Finished estimates. Got {estimates_df['lb_estimate_qty'].notna().sum()} successful estimates out of {len(df_to_process)} tasks.")

    logger.info(f"Saving task estimates to {CACHE_PATH}")
    estimates_df.to_parquet(CACHE_PATH)

    return estimates_df
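

# Hypothetical usage sketch (assumes an upstream DataFrame `df_tasks` with
# `task`, `task_id`, `onetsoc_code`, `occupation_title`, and
# `occupation_description` columns, as the functions above expect):
#
#     cache = Path("cache")
#     estimability = classify_tasks_as_estimable(cache, df_tasks)
#     estimable = df_tasks.merge(estimability, on="task")
#     estimates = generate_time_estimates_for_tasks(
#         cache, estimable[estimable["estimable"] == True]
#     )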