sprint-econtai/pipeline/classification.py
Félix Dorn 65dc648797 wip
2025-07-15 00:34:54 +02:00

225 lines
9.8 KiB
Python

from pathlib import Path
import pandas as pd
from .logger import logger
from .utils import enrich
import json
# Time units the model is allowed to use in its structured estimates;
# wired into the JSON-schema "enum" fields passed to the API below.
ALLOWED_UNITS = [
    "minute",
    "hour",
    "day",
    "week",
    "month",
    "trimester",
    "semester",
    "year",
]
# Cache-version tags embedded in the parquet file names below; bump one of
# these to invalidate the corresponding on-disk cache.
ESTIMABLE_CLASSIFICATION_VERSION = "old_version"
TIME_ESTIMATES_GENERATION_VERSION = "old_version"
def classify_tasks_as_estimable(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
    """Classify each unique task as ATOMIC (estimable) or ONGOING-CONSTRAINT.

    Results are cached on disk as a parquet file keyed by
    ESTIMABLE_CLASSIFICATION_VERSION; pass ``bust=True`` to ignore the cache
    and re-run the classification.

    Args:
        cache_dir: Directory holding the parquet cache.
        df_to_process: DataFrame with a ``task`` column (duplicates allowed;
            only unique tasks are sent to the model).
        bust: When True, recompute even if a cache file exists.

    Returns:
        DataFrame with columns ``task`` and boolean ``estimable``. Failed or
        unparseable API responses are recorded as ``estimable=False``.

    Raises:
        ValueError: If there are no unique tasks to classify, or the API
            returns a mismatched number of results.
    """
    CACHE_PATH = cache_dir / f"task_estimability.{ESTIMABLE_CLASSIFICATION_VERSION}.parquet"
    if CACHE_PATH.exists() and not bust:
        logger.info(f"Loading cached task estimability from {CACHE_PATH}")
        return pd.read_parquet(CACHE_PATH)
    logger.info("Enriching tasks with estimability classification.")
    df_unique_tasks = df_to_process.drop_duplicates(subset=['task']).copy()
    logger.info(f"Found {len(df_unique_tasks)} unique remote tasks to classify.")
    if df_unique_tasks.empty:
        raise ValueError("No unique tasks to classify.")
    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=[
            [
                {"role": "system", "content": """
Classify the provided O*NET task into one of these categories:
- ATOMIC (schedulable): A single, clearly-bounded activity, typically lasting minutes, hours, or a few days.
- ONGOING-CONSTRAINT (background role/ethical rule): A continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”).
""".strip()},
                {"role": "user", "content": f"Task: {row.task}"},
            ]
            for row in df_unique_tasks.itertuples()
        ],
        schema={
            "name": "estimability_classification",
            "schema": {
                "type": "object",
                "properties": {"task_category": {"type": "string", "enum": ["ATOMIC", "ONGOING-CONSTRAINT"]}},
                "required": ["task_category"],
                "additionalProperties": False
            }
        },
        chunk_size=300,
    )
    # enrich is expected to return one response per message, in order.
    if not results or len(results) != len(df_unique_tasks):
        raise ValueError(f"Task estimability classification failed or returned mismatched number of results. Expected {len(df_unique_tasks)}, got {len(results) if results else 0}.")
    classifications = []
    successful = 0  # count of responses that yielded a usable task_category
    for index, response in enumerate(results):
        task_label = df_unique_tasks.iloc[index]['task']
        task_category_flag = None
        if response is None:
            logger.warning(f"API call failed for task (enrich returned None): '{task_label}'")
        else:
            try:
                content_str = response.choices[0].message.content
                if not content_str:
                    raise ValueError("No content found in the response message")
                data = json.loads(content_str)
                if 'task_category' in data and isinstance(data['task_category'], str):
                    task_category_flag = data['task_category']
                else:
                    logger.warning(f"Invalid or missing 'task_category' payload for task '{task_label}'. Data: '{data}'")
            except (json.JSONDecodeError, AttributeError, KeyError, IndexError, ValueError) as e:
                logger.warning(f"Could not parse response for task '{task_label}'. Error: {e}. Response: {response}")
        if task_category_flag is not None:
            successful += 1
        classifications.append({
            'task': task_label,
            # Failures (task_category_flag is None) deliberately fall through
            # to False here, matching the cached column's boolean dtype.
            'estimable': task_category_flag == 'ATOMIC'
        })
    classification_df = pd.DataFrame(classifications)
    # BUG FIX: the previous count used classification_df['estimable'].notna().sum(),
    # but 'estimable' is always a concrete bool, so notna() was always True and
    # the logged "successful" count silently equalled the total.
    logger.info(f"Finished classification. Got {successful} successful classifications out of {len(df_unique_tasks)} unique tasks.")
    logger.info(f"Saving task estimability classifications to {CACHE_PATH}")
    classification_df.to_parquet(CACHE_PATH)
    return classification_df
def generate_time_estimates_for_tasks(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
    """Ask the model for lower/upper 'effective time' bounds for each task row.

    Results are cached on disk as a parquet file keyed by
    TIME_ESTIMATES_GENERATION_VERSION; pass ``bust=True`` to ignore the cache
    and re-run the generation.

    Args:
        cache_dir: Directory holding the parquet cache.
        df_to_process: DataFrame with columns ``task``, ``occupation_title``,
            ``occupation_description``, ``onetsoc_code`` and ``task_id``
            (one API call per row).
        bust: When True, recompute even if a cache file exists.

    Returns:
        DataFrame with columns ``onetsoc_code``, ``task_id``,
        ``lb_estimate_qty``, ``lb_estimate_unit``, ``ub_estimate_qty`` and
        ``ub_estimate_unit``; the estimate columns are None for rows whose
        response failed or could not be parsed.

    Raises:
        ValueError: If the input frame is empty, or the API returns a
            mismatched number of results.
    """
    CACHE_PATH = cache_dir / f"task_estimates.{TIME_ESTIMATES_GENERATION_VERSION}.parquet"
    if CACHE_PATH.exists() and not bust:
        logger.info(f"Loading cached task estimates from {CACHE_PATH}")
        return pd.read_parquet(CACHE_PATH)
    logger.info("Enriching tasks with time estimates.")
    if df_to_process.empty:
        raise ValueError("No tasks to process for estimates.")
    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=[
            [
                {
                    "role": "system",
                    "content": """
You are an expert assistant evaluating the time required for job tasks. Your goal is to estimate the 'effective time' range needed for a skilled human to complete the following job task **remotely**, without supervision
'Effective time' is the active, focused work duration required to complete the task. Crucially, **exclude all waiting periods, delays, or time spent on other unrelated activities**. Think of it as the continuous, productive time investment needed if the worker could pause and resume instantly without cost.
Provide a lower and upper bound estimate for the 'effective time'. These bounds should capture the time within which approximately 80% of instances of performing this specific task are typically completed by a qualified individual.
Base your estimate on the provided task and the associated occupation and occupation description. Your estimate must be in one the allowed units: minute, hour, day, week, month, trimester, semester, year.""".strip()
                },
                {
                    "role": "user",
                    "content": f"{row.task} done by {row.occupation_title} ({row.occupation_description})"
                }
            ]
            for row in df_to_process.itertuples()
        ],
        schema={
            "name": "estimate_time",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "lower_bound_estimate": {
                        "type": "object",
                        "properties": {
                            "quantity": {
                                "type": "number",
                                "description": "The numerical value for the lower bound of the estimate.",
                            },
                            "unit": {
                                "type": "string",
                                "enum": ALLOWED_UNITS,
                                "description": "The unit of time for the lower bound.",
                            },
                        },
                        "required": ["quantity", "unit"],
                        "additionalProperties": False,
                    },
                    "upper_bound_estimate": {
                        "type": "object",
                        "properties": {
                            "quantity": {
                                "type": "number",
                                "description": "The numerical value for the upper bound of the estimate.",
                            },
                            "unit": {
                                "type": "string",
                                "enum": ALLOWED_UNITS,
                                "description": "The unit of time for the upper bound.",
                            },
                        },
                        "required": ["quantity", "unit"],
                        "additionalProperties": False,
                    },
                },
                "required": ["lower_bound_estimate", "upper_bound_estimate"],
                "additionalProperties": False,
            },
        },
        chunk_size=200,
    )
    # enrich is expected to return one response per input row, in order.
    if not results or len(results) != len(df_to_process):
        raise ValueError(f"API call for task estimates failed or returned mismatched number of results. "
                         f"Expected {len(df_to_process)}, got {len(results) if results else 0}.")
    estimates = []
    for index, response in enumerate(results):
        row = df_to_process.iloc[index]
        task_info = f"O*NET: {row.onetsoc_code}, Task ID: {row.task_id}"
        lb_qty, lb_unit, ub_qty, ub_unit = None, None, None, None
        if response is None:
            logger.warning(f"API call failed for task (enrich returned None): {task_info}")
        else:
            try:
                content_str = response.choices[0].message.content
                if not content_str:
                    raise ValueError("No content found in the response message")
                data = json.loads(content_str)
                lb_qty = data['lower_bound_estimate']['quantity']
                lb_unit = data['lower_bound_estimate']['unit']
                ub_qty = data['upper_bound_estimate']['quantity']
                ub_unit = data['upper_bound_estimate']['unit']
            # Narrowed from a bare `except Exception` so genuine programming
            # errors surface instead of being logged as parse failures;
            # mirrors the exception tuple used in classify_tasks_as_estimable
            # (plus TypeError for non-dict JSON payloads).
            except (json.JSONDecodeError, AttributeError, KeyError, IndexError, TypeError, ValueError) as e:
                logger.warning(f"Could not parse valid estimate for task {task_info}. Error: {e}. Response: {response}")
                lb_qty, lb_unit, ub_qty, ub_unit = None, None, None, None  # Reset on partial parse
        estimates.append({
            'onetsoc_code': row.onetsoc_code,
            'task_id': row.task_id,
            'lb_estimate_qty': lb_qty,
            'lb_estimate_unit': lb_unit,
            'ub_estimate_qty': ub_qty,
            'ub_estimate_unit': ub_unit
        })
    estimates_df = pd.DataFrame(estimates)
    # lb_estimate_qty is genuinely None on failure, so notna() is a valid
    # success count here (unlike the boolean column in the classifier).
    logger.info(f"Finished estimates. Got {estimates_df['lb_estimate_qty'].notna().sum()} successful estimates out of {len(df_to_process)} tasks.")
    logger.info(f"Saving task estimates to {CACHE_PATH}")
    estimates_df.to_parquet(CACHE_PATH)
    return estimates_df