from pathlib import Path
import json

import pandas as pd

from .logger import logger
from .utils import enrich

ALLOWED_UNITS = [
    "minute",
    "hour",
    "day",
    "week",
    "month",
    "trimester",
    "semester",
    "year",
]

ESTIMABLE_CLASSIFICATION_VERSION = "old_version"
TIME_ESTIMATES_GENERATION_VERSION = "old_version"


def classify_tasks_as_estimable(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
    """Classify each unique task as ATOMIC (estimable) or ONGOING-CONSTRAINT, caching results as parquet."""
    CACHE_PATH = cache_dir / f"task_estimability.{ESTIMABLE_CLASSIFICATION_VERSION}.parquet"

    if CACHE_PATH.exists() and not bust:
        logger.info(f"Loading cached task estimability from {CACHE_PATH}")
        return pd.read_parquet(CACHE_PATH)

    logger.info("Enriching tasks with estimability classification.")

    df_unique_tasks = df_to_process.drop_duplicates(subset=['task']).copy()
    logger.info(f"Found {len(df_unique_tasks)} unique remote tasks to classify.")

    if df_unique_tasks.empty:
        raise ValueError("No unique tasks to classify.")

    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=[
            [
                {
                    "role": "system",
                    "content": """
Classify the provided O*NET task into one of these categories:
- ATOMIC (schedulable): A single, clearly-bounded activity, typically lasting minutes, hours, or a few days.
- ONGOING-CONSTRAINT (background role/ethical rule): A continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”).
""".strip(),
                },
                {"role": "user", "content": f"Task: {row.task}"},
            ]
            for row in df_unique_tasks.itertuples()
        ],
        schema={
            "name": "estimability_classification",
            "schema": {
                "type": "object",
                "properties": {
                    "task_category": {
                        "type": "string",
                        "enum": ["ATOMIC", "ONGOING-CONSTRAINT"],
                    }
                },
                "required": ["task_category"],
                "additionalProperties": False,
            },
        },
        chunk_size=300,
    )

    if not results or len(results) != len(df_unique_tasks):
        raise ValueError(
            f"Task estimability classification failed or returned a mismatched number of results. "
            f"Expected {len(df_unique_tasks)}, got {len(results) if results else 0}."
        )

    classifications = []
    for index, response in enumerate(results):
        task_label = df_unique_tasks.iloc[index]['task']
        task_category_flag = None

        if response is None:
            logger.warning(f"API call failed for task (enrich returned None): '{task_label}'")
        else:
            try:
                content_str = response.choices[0].message.content
                if not content_str:
                    raise ValueError("No content found in the response message")
                data = json.loads(content_str)
                if 'task_category' in data and isinstance(data['task_category'], str):
                    task_category_flag = data['task_category']
                else:
                    logger.warning(f"Invalid or missing 'task_category' payload for task '{task_label}'. Data: '{data}'")
            except (json.JSONDecodeError, AttributeError, KeyError, IndexError, ValueError) as e:
                logger.warning(f"Could not parse response for task '{task_label}'. Error: {e}. Response: {response}")

        classifications.append({
            'task': task_label,
            # Keep None (rather than False) when the call failed, so failures stay
            # distinguishable from genuine ONGOING-CONSTRAINT tasks and the
            # notna()-based success count below stays accurate.
            'estimable': (task_category_flag == 'ATOMIC') if task_category_flag is not None else None,
        })

    classification_df = pd.DataFrame(classifications)
    logger.info(
        f"Finished classification. Got {classification_df['estimable'].notna().sum()} successful "
        f"classifications out of {len(df_unique_tasks)} unique tasks."
    )

    logger.info(f"Saving task estimability classifications to {CACHE_PATH}")
    classification_df.to_parquet(CACHE_PATH)
    return classification_df
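
# Note on the contract between the two steps (a sketch of intended caller-side usage,
# not enforced anywhere in this module): classify_tasks_as_estimable returns one row
# per unique `task` string, so callers are expected to merge it back onto the full
# task frame and filter before estimating. For a hypothetical frame `df`:
#
#     df = df.merge(classify_tasks_as_estimable(cache_dir, df), on="task", how="left")
#     df_estimable = df[df["estimable"] == True]
#     generate_time_estimates_for_tasks(cache_dir, df_estimable)
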
def generate_time_estimates_for_tasks(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
    """Generate lower/upper 'effective time' bounds for each task, caching results as parquet."""
    CACHE_PATH = cache_dir / f"task_estimates.{TIME_ESTIMATES_GENERATION_VERSION}.parquet"

    if CACHE_PATH.exists() and not bust:
        logger.info(f"Loading cached task estimates from {CACHE_PATH}")
        return pd.read_parquet(CACHE_PATH)

    logger.info("Enriching tasks with time estimates.")

    if df_to_process.empty:
        raise ValueError("No tasks to process for estimates.")

    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=[
            [
                {
                    "role": "system",
                    "content": """
You are an expert assistant evaluating the time required for job tasks. Your goal is to estimate the 'effective time' range needed for a skilled human to complete the following job task **remotely**, without supervision.

'Effective time' is the active, focused work duration required to complete the task. Crucially, **exclude all waiting periods, delays, or time spent on other unrelated activities**. Think of it as the continuous, productive time investment needed if the worker could pause and resume instantly without cost.

Provide a lower and upper bound estimate for the 'effective time'. These bounds should capture the time within which approximately 80% of instances of performing this specific task are typically completed by a qualified individual.

Base your estimate on the provided task and the associated occupation and occupation description.

Your estimate must be in one of the allowed units: minute, hour, day, week, month, trimester, semester, year.
""".strip(),
                },
                {
                    "role": "user",
                    "content": f"{row.task} done by {row.occupation_title} ({row.occupation_description})",
                },
            ]
            for row in df_to_process.itertuples()
        ],
        schema={
            "name": "estimate_time",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "lower_bound_estimate": {
                        "type": "object",
                        "properties": {
                            "quantity": {
                                "type": "number",
                                "description": "The numerical value for the lower bound of the estimate.",
                            },
                            "unit": {
                                "type": "string",
                                "enum": ALLOWED_UNITS,
                                "description": "The unit of time for the lower bound.",
                            },
                        },
                        "required": ["quantity", "unit"],
                        "additionalProperties": False,
                    },
                    "upper_bound_estimate": {
                        "type": "object",
                        "properties": {
                            "quantity": {
                                "type": "number",
                                "description": "The numerical value for the upper bound of the estimate.",
                            },
                            "unit": {
                                "type": "string",
                                "enum": ALLOWED_UNITS,
                                "description": "The unit of time for the upper bound.",
                            },
                        },
                        "required": ["quantity", "unit"],
                        "additionalProperties": False,
                    },
                },
                "required": ["lower_bound_estimate", "upper_bound_estimate"],
                "additionalProperties": False,
            },
        },
        chunk_size=200,
    )

    if not results or len(results) != len(df_to_process):
        raise ValueError(
            f"API call for task estimates failed or returned a mismatched number of results. "
            f"Expected {len(df_to_process)}, got {len(results) if results else 0}."
        )
" f"Expected {len(df_to_process)}, got {len(results) if results else 0}.") estimates = [] for index, response in enumerate(results): row = df_to_process.iloc[index] task_info = f"O*NET: {row.onetsoc_code}, Task ID: {row.task_id}" lb_qty, lb_unit, ub_qty, ub_unit = None, None, None, None if response is None: logger.warning(f"API call failed for task (enrich returned None): {task_info}") else: try: content_str = response.choices[0].message.content if not content_str: raise ValueError("No content found in the response message") data = json.loads(content_str) lb_qty = data['lower_bound_estimate']['quantity'] lb_unit = data['lower_bound_estimate']['unit'] ub_qty = data['upper_bound_estimate']['quantity'] ub_unit = data['upper_bound_estimate']['unit'] except Exception as e: logger.warning(f"Could not parse valid estimate for task {task_info}. Error: {e}. Response: {response}") lb_qty, lb_unit, ub_qty, ub_unit = None, None, None, None # Reset on failure estimates.append({ 'onetsoc_code': row.onetsoc_code, 'task_id': row.task_id, 'lb_estimate_qty': lb_qty, 'lb_estimate_unit': lb_unit, 'ub_estimate_qty': ub_qty, 'ub_estimate_unit': ub_unit }) estimates_df = pd.DataFrame(estimates) logger.info(f"Finished estimates. Got {estimates_df['lb_estimate_qty'].notna().sum()} successful estimates out of {len(df_to_process)} tasks.") logger.info(f"Saving task estimates to {CACHE_PATH}") estimates_df.to_parquet(CACHE_PATH) return estimates_df