wip
parent 62296e1b69
commit 65dc648797
37 changed files with 1413 additions and 2433 deletions
225 pipeline/classification.py Normal file
@@ -0,0 +1,225 @@
from pathlib import Path
import json

import pandas as pd

from .logger import logger
from .utils import enrich

ALLOWED_UNITS = [
    "minute",
    "hour",
    "day",
    "week",
    "month",
    "trimester",
    "semester",
    "year",
]

ESTIMABLE_CLASSIFICATION_VERSION = "old_version"
TIME_ESTIMATES_GENERATION_VERSION = "old_version"
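
# Illustrative only, not used by this module: downstream analysis has to collapse
# each (quantity, unit) pair onto a single scale before bounds can be compared.
# A minimal sketch, assuming 8 focused hours per day and 40 per week, with
# "trimester" read as 13 weeks and "semester" as 26 (all assumptions, not
# conversions defined anywhere in this commit):
HOURS_PER_UNIT = {
    "minute": 1 / 60,
    "hour": 1,
    "day": 8,
    "week": 40,
    "month": 40 * 52 / 12,
    "trimester": 40 * 13,
    "semester": 40 * 26,
    "year": 40 * 52,
}
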
def classify_tasks_as_estimable(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
    CACHE_PATH = cache_dir / f"task_estimability.{ESTIMABLE_CLASSIFICATION_VERSION}.parquet"
    if CACHE_PATH.exists() and not bust:
        logger.info(f"Loading cached task estimability from {CACHE_PATH}")
        return pd.read_parquet(CACHE_PATH)

    logger.info("Enriching tasks with estimability classification.")

    df_unique_tasks = df_to_process.drop_duplicates(subset=['task']).copy()

    logger.info(f"Found {len(df_unique_tasks)} unique remote tasks to classify.")

    if df_unique_tasks.empty:
        raise ValueError("No unique tasks to classify.")

    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=[
            [
                {"role": "system", "content": """
Classify the provided O*NET task into one of these categories:
- ATOMIC (schedulable): A single, clearly-bounded activity, typically lasting minutes, hours, or a few days.
- ONGOING-CONSTRAINT (background role/ethical rule): A continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”).
""".strip()},
                {"role": "user", "content": f"Task: {row.task}"},
            ]
            for row in df_unique_tasks.itertuples()
        ],
        schema={
            "name": "estimability_classification",
            "schema": {
                "type": "object",
                "properties": {"task_category": {"type": "string", "enum": ["ATOMIC", "ONGOING-CONSTRAINT"]}},
                "required": ["task_category"],
                "additionalProperties": False,
            },
        },
        chunk_size=300,
    )

    if not results or len(results) != len(df_unique_tasks):
        raise ValueError(
            f"Task estimability classification failed or returned a mismatched number of results. "
            f"Expected {len(df_unique_tasks)}, got {len(results) if results else 0}."
        )

    classifications = []
    for index, response in enumerate(results):
        task_label = df_unique_tasks.iloc[index]['task']
        task_category_flag = None

        if response is None:
            logger.warning(f"API call failed for task (enrich returned None): '{task_label}'")
        else:
            try:
                content_str = response.choices[0].message.content
                if not content_str:
                    raise ValueError("No content found in the response message")

                data = json.loads(content_str)

                if 'task_category' in data and isinstance(data['task_category'], str):
                    task_category_flag = data['task_category']
                else:
                    logger.warning(f"Invalid or missing 'task_category' payload for task '{task_label}'. Data: '{data}'")
            except (json.JSONDecodeError, AttributeError, KeyError, IndexError, ValueError) as e:
                logger.warning(f"Could not parse response for task '{task_label}'. Error: {e}. Response: {response}")

        classifications.append({
            'task': task_label,
            # Keep None (rather than False) when no category was obtained, so the
            # notna() count below reflects successful classifications only.
            'estimable': task_category_flag == 'ATOMIC' if task_category_flag is not None else None,
        })

    classification_df = pd.DataFrame(classifications)

    logger.info(f"Finished classification. Got {classification_df['estimable'].notna().sum()} successful classifications out of {len(df_unique_tasks)} unique tasks.")

    logger.info(f"Saving task estimability classifications to {CACHE_PATH}")
    classification_df.to_parquet(CACHE_PATH)

    return classification_df

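# For reference, hypothetical but schema-conforming payloads: a response that
# satisfies `estimability_classification` above decodes to, e.g.,
#   {"task_category": "ATOMIC"}
# and `estimate_time` below to, e.g.,
#   {"lower_bound_estimate": {"quantity": 30, "unit": "minute"},
#    "upper_bound_estimate": {"quantity": 2, "unit": "hour"}}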
def generate_time_estimates_for_tasks(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
    CACHE_PATH = cache_dir / f"task_estimates.{TIME_ESTIMATES_GENERATION_VERSION}.parquet"
    if CACHE_PATH.exists() and not bust:
        logger.info(f"Loading cached task estimates from {CACHE_PATH}")
        return pd.read_parquet(CACHE_PATH)

    logger.info("Enriching tasks with time estimates.")

    if df_to_process.empty:
        raise ValueError("No tasks to process for estimates.")

    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=[
            [
                {
                    "role": "system",
                    "content": """
You are an expert assistant evaluating the time required for job tasks. Your goal is to estimate the 'effective time' range needed for a skilled human to complete the following job task **remotely**, without supervision.

'Effective time' is the active, focused work duration required to complete the task. Crucially, **exclude all waiting periods, delays, or time spent on other unrelated activities**. Think of it as the continuous, productive time investment needed if the worker could pause and resume instantly without cost.

Provide a lower and upper bound estimate for the 'effective time'. These bounds should capture the time within which approximately 80% of instances of performing this specific task are typically completed by a qualified individual.

Base your estimate on the provided task and the associated occupation and occupation description. Your estimate must be in one of the allowed units: minute, hour, day, week, month, trimester, semester, year.""".strip(),
                },
                {
                    "role": "user",
                    "content": f"{row.task} done by {row.occupation_title} ({row.occupation_description})",
                },
            ]
            for row in df_to_process.itertuples()
        ],
        schema={
            "name": "estimate_time",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "lower_bound_estimate": {
                        "type": "object",
                        "properties": {
                            "quantity": {
                                "type": "number",
                                "description": "The numerical value for the lower bound of the estimate.",
                            },
                            "unit": {
                                "type": "string",
                                "enum": ALLOWED_UNITS,
                                "description": "The unit of time for the lower bound.",
                            },
                        },
                        "required": ["quantity", "unit"],
                        "additionalProperties": False,
                    },
                    "upper_bound_estimate": {
                        "type": "object",
                        "properties": {
                            "quantity": {
                                "type": "number",
                                "description": "The numerical value for the upper bound of the estimate.",
                            },
                            "unit": {
                                "type": "string",
                                "enum": ALLOWED_UNITS,
                                "description": "The unit of time for the upper bound.",
                            },
                        },
                        "required": ["quantity", "unit"],
                        "additionalProperties": False,
                    },
                },
                "required": ["lower_bound_estimate", "upper_bound_estimate"],
                "additionalProperties": False,
            },
        },
        chunk_size=200,
    )

    if not results or len(results) != len(df_to_process):
        raise ValueError(
            f"API call for task estimates failed or returned a mismatched number of results. "
            f"Expected {len(df_to_process)}, got {len(results) if results else 0}."
        )

    estimates = []
    for index, response in enumerate(results):
        row = df_to_process.iloc[index]
        task_info = f"O*NET: {row.onetsoc_code}, Task ID: {row.task_id}"
        lb_qty, lb_unit, ub_qty, ub_unit = None, None, None, None

        if response is None:
            logger.warning(f"API call failed for task (enrich returned None): {task_info}")
        else:
            try:
                content_str = response.choices[0].message.content
                if not content_str:
                    raise ValueError("No content found in the response message")

                data = json.loads(content_str)

                lb_qty = data['lower_bound_estimate']['quantity']
                lb_unit = data['lower_bound_estimate']['unit']
                ub_qty = data['upper_bound_estimate']['quantity']
                ub_unit = data['upper_bound_estimate']['unit']
            except Exception as e:
                logger.warning(f"Could not parse valid estimate for task {task_info}. Error: {e}. Response: {response}")
                lb_qty, lb_unit, ub_qty, ub_unit = None, None, None, None  # Reset on failure

        estimates.append({
            'onetsoc_code': row.onetsoc_code,
            'task_id': row.task_id,
            'lb_estimate_qty': lb_qty,
            'lb_estimate_unit': lb_unit,
            'ub_estimate_qty': ub_qty,
            'ub_estimate_unit': ub_unit,
        })

    estimates_df = pd.DataFrame(estimates)
    logger.info(f"Finished estimates. Got {estimates_df['lb_estimate_qty'].notna().sum()} successful estimates out of {len(df_to_process)} tasks.")

    logger.info(f"Saving task estimates to {CACHE_PATH}")
    estimates_df.to_parquet(CACHE_PATH)

    return estimates_df
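
if __name__ == "__main__":
    # Minimal end-to-end usage sketch, not invoked by the pipeline itself.
    # The file name and columns here are hypothetical: the frame is assumed to
    # carry 'task', 'task_id', 'onetsoc_code', 'occupation_title', and
    # 'occupation_description'.
    df = pd.read_parquet("tasks.parquet")
    cache = Path("cache")

    flags = classify_tasks_as_estimable(cache, df)
    df = df.merge(flags, on="task", how="left")

    # Only ATOMIC (schedulable) tasks get time estimates; failed
    # classifications are None and are filtered out by the equality check.
    estimates = generate_time_estimates_for_tasks(cache, df[df["estimable"] == True])
    print(estimates.head())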