sprint-econtai/pipeline/classification.py
Félix Dorn 65dc648797 wip
2025-07-15 00:34:54 +02:00

225 lines
9.8 KiB
Python

from pathlib import Path
import pandas as pd
from .logger import logger
from .utils import enrich
import json
# Time units the model is allowed to use in its structured estimates;
# wired into the JSON-schema "enum" fields passed to the API below.
ALLOWED_UNITS = [
    "minute",
    "hour",
    "day",
    "week",
    "month",
    "trimester",
    "semester",
    "year",
]
# Cache-version tags embedded in the parquet file names below; bump one of
# these to invalidate the corresponding on-disk cache.
ESTIMABLE_CLASSIFICATION_VERSION = "old_version"
TIME_ESTIMATES_GENERATION_VERSION = "old_version"
def classify_tasks_as_estimable(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
    """Classify each unique task as ATOMIC (estimable) or ONGOING-CONSTRAINT.

    Results are cached on disk as a parquet file keyed by
    ESTIMABLE_CLASSIFICATION_VERSION; pass ``bust=True`` to ignore the cache
    and re-run the classification.

    Args:
        cache_dir: Directory holding the parquet cache.
        df_to_process: DataFrame with a ``task`` column (duplicates allowed;
            only unique tasks are sent to the model).
        bust: When True, recompute even if a cache file exists.

    Returns:
        DataFrame with columns ``task`` and boolean ``estimable``. Failed or
        unparseable API responses are recorded as ``estimable=False``.

    Raises:
        ValueError: If there are no unique tasks to classify, or the API
            returns a mismatched number of results.
    """
    CACHE_PATH = cache_dir / f"task_estimability.{ESTIMABLE_CLASSIFICATION_VERSION}.parquet"
    if CACHE_PATH.exists() and not bust:
        logger.info(f"Loading cached task estimability from {CACHE_PATH}")
        return pd.read_parquet(CACHE_PATH)
    logger.info("Enriching tasks with estimability classification.")
    df_unique_tasks = df_to_process.drop_duplicates(subset=['task']).copy()
    logger.info(f"Found {len(df_unique_tasks)} unique remote tasks to classify.")
    if df_unique_tasks.empty:
        raise ValueError("No unique tasks to classify.")
    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=[
            [
                {"role": "system", "content": """
Classify the provided O*NET task into one of these categories:
- ATOMIC (schedulable): A single, clearly-bounded activity, typically lasting minutes, hours, or a few days.
- ONGOING-CONSTRAINT (background role/ethical rule): A continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”).
""".strip()},
                {"role": "user", "content": f"Task: {row.task}"},
            ]
            for row in df_unique_tasks.itertuples()
        ],
        schema={
            "name": "estimability_classification",
            "schema": {
                "type": "object",
                "properties": {"task_category": {"type": "string", "enum": ["ATOMIC", "ONGOING-CONSTRAINT"]}},
                "required": ["task_category"],
                "additionalProperties": False
            }
        },
        chunk_size=300,
    )
    # enrich is expected to return one response per message, in order.
    if not results or len(results) != len(df_unique_tasks):
        raise ValueError(f"Task estimability classification failed or returned mismatched number of results. Expected {len(df_unique_tasks)}, got {len(results) if results else 0}.")
    classifications = []
    successful = 0  # count of responses that yielded a usable task_category
    for index, response in enumerate(results):
        task_label = df_unique_tasks.iloc[index]['task']
        task_category_flag = None
        if response is None:
            logger.warning(f"API call failed for task (enrich returned None): '{task_label}'")
        else:
            try:
                content_str = response.choices[0].message.content
                if not content_str:
                    raise ValueError("No content found in the response message")
                data = json.loads(content_str)
                if 'task_category' in data and isinstance(data['task_category'], str):
                    task_category_flag = data['task_category']
                else:
                    logger.warning(f"Invalid or missing 'task_category' payload for task '{task_label}'. Data: '{data}'")
            except (json.JSONDecodeError, AttributeError, KeyError, IndexError, ValueError) as e:
                logger.warning(f"Could not parse response for task '{task_label}'. Error: {e}. Response: {response}")
        if task_category_flag is not None:
            successful += 1
        classifications.append({
            'task': task_label,
            # Failures (task_category_flag is None) deliberately fall through
            # to False here, matching the cached column's boolean dtype.
            'estimable': task_category_flag == 'ATOMIC'
        })
    classification_df = pd.DataFrame(classifications)
    # BUG FIX: the previous count used classification_df['estimable'].notna().sum(),
    # but 'estimable' is always a concrete bool, so notna() was always True and
    # the logged "successful" count silently equalled the total.
    logger.info(f"Finished classification. Got {successful} successful classifications out of {len(df_unique_tasks)} unique tasks.")
    logger.info(f"Saving task estimability classifications to {CACHE_PATH}")
    classification_df.to_parquet(CACHE_PATH)
    return classification_df
def generate_time_estimates_for_tasks(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
    """Ask the model for lower/upper 'effective time' bounds for each task row.

    Results are cached on disk as a parquet file keyed by
    TIME_ESTIMATES_GENERATION_VERSION; pass ``bust=True`` to ignore the cache
    and re-run the generation.

    Args:
        cache_dir: Directory holding the parquet cache.
        df_to_process: DataFrame with columns ``task``, ``occupation_title``,
            ``occupation_description``, ``onetsoc_code`` and ``task_id``
            (one API call per row).
        bust: When True, recompute even if a cache file exists.

    Returns:
        DataFrame with columns ``onetsoc_code``, ``task_id``,
        ``lb_estimate_qty``, ``lb_estimate_unit``, ``ub_estimate_qty`` and
        ``ub_estimate_unit``; the estimate columns are None for rows whose
        response failed or could not be parsed.

    Raises:
        ValueError: If the input frame is empty, or the API returns a
            mismatched number of results.
    """
    CACHE_PATH = cache_dir / f"task_estimates.{TIME_ESTIMATES_GENERATION_VERSION}.parquet"
    if CACHE_PATH.exists() and not bust:
        logger.info(f"Loading cached task estimates from {CACHE_PATH}")
        return pd.read_parquet(CACHE_PATH)
    logger.info("Enriching tasks with time estimates.")
    if df_to_process.empty:
        raise ValueError("No tasks to process for estimates.")
    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=[
            [
                {
                    "role": "system",
                    "content": """
You are an expert assistant evaluating the time required for job tasks. Your goal is to estimate the 'effective time' range needed for a skilled human to complete the following job task **remotely**, without supervision
'Effective time' is the active, focused work duration required to complete the task. Crucially, **exclude all waiting periods, delays, or time spent on other unrelated activities**. Think of it as the continuous, productive time investment needed if the worker could pause and resume instantly without cost.
Provide a lower and upper bound estimate for the 'effective time'. These bounds should capture the time within which approximately 80% of instances of performing this specific task are typically completed by a qualified individual.
Base your estimate on the provided task and the associated occupation and occupation description. Your estimate must be in one the allowed units: minute, hour, day, week, month, trimester, semester, year.""".strip()
                },
                {
                    "role": "user",
                    "content": f"{row.task} done by {row.occupation_title} ({row.occupation_description})"
                }
            ]
            for row in df_to_process.itertuples()
        ],
        schema={
            "name": "estimate_time",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "lower_bound_estimate": {
                        "type": "object",
                        "properties": {
                            "quantity": {
                                "type": "number",
                                "description": "The numerical value for the lower bound of the estimate.",
                            },
                            "unit": {
                                "type": "string",
                                "enum": ALLOWED_UNITS,
                                "description": "The unit of time for the lower bound.",
                            },
                        },
                        "required": ["quantity", "unit"],
                        "additionalProperties": False,
                    },
                    "upper_bound_estimate": {
                        "type": "object",
                        "properties": {
                            "quantity": {
                                "type": "number",
                                "description": "The numerical value for the upper bound of the estimate.",
                            },
                            "unit": {
                                "type": "string",
                                "enum": ALLOWED_UNITS,
                                "description": "The unit of time for the upper bound.",
                            },
                        },
                        "required": ["quantity", "unit"],
                        "additionalProperties": False,
                    },
                },
                "required": ["lower_bound_estimate", "upper_bound_estimate"],
                "additionalProperties": False,
            },
        },
        chunk_size=200,
    )
    # enrich is expected to return one response per input row, in order.
    if not results or len(results) != len(df_to_process):
        raise ValueError(f"API call for task estimates failed or returned mismatched number of results. "
                         f"Expected {len(df_to_process)}, got {len(results) if results else 0}.")
    estimates = []
    for index, response in enumerate(results):
        row = df_to_process.iloc[index]
        task_info = f"O*NET: {row.onetsoc_code}, Task ID: {row.task_id}"
        lb_qty, lb_unit, ub_qty, ub_unit = None, None, None, None
        if response is None:
            logger.warning(f"API call failed for task (enrich returned None): {task_info}")
        else:
            try:
                content_str = response.choices[0].message.content
                if not content_str:
                    raise ValueError("No content found in the response message")
                data = json.loads(content_str)
                lb_qty = data['lower_bound_estimate']['quantity']
                lb_unit = data['lower_bound_estimate']['unit']
                ub_qty = data['upper_bound_estimate']['quantity']
                ub_unit = data['upper_bound_estimate']['unit']
            # Narrowed from a bare `except Exception` so genuine programming
            # errors surface instead of being logged as parse failures;
            # mirrors the exception tuple used in classify_tasks_as_estimable
            # (plus TypeError for non-dict JSON payloads).
            except (json.JSONDecodeError, AttributeError, KeyError, IndexError, TypeError, ValueError) as e:
                logger.warning(f"Could not parse valid estimate for task {task_info}. Error: {e}. Response: {response}")
                lb_qty, lb_unit, ub_qty, ub_unit = None, None, None, None  # Reset on partial parse
        estimates.append({
            'onetsoc_code': row.onetsoc_code,
            'task_id': row.task_id,
            'lb_estimate_qty': lb_qty,
            'lb_estimate_unit': lb_unit,
            'ub_estimate_qty': ub_qty,
            'ub_estimate_unit': ub_unit
        })
    estimates_df = pd.DataFrame(estimates)
    # lb_estimate_qty is genuinely None on failure, so notna() is a valid
    # success count here (unlike the boolean column in the classifier).
    logger.info(f"Finished estimates. Got {estimates_df['lb_estimate_qty'].notna().sum()} successful estimates out of {len(df_to_process)} tasks.")
    logger.info(f"Saving task estimates to {CACHE_PATH}")
    estimates_df.to_parquet(CACHE_PATH)
    return estimates_df