wip
parent 62296e1b69
commit 65dc648797
37 changed files with 1413 additions and 2433 deletions
225 pipeline/classification.py Normal file
@@ -0,0 +1,225 @@
from pathlib import Path
import json

import pandas as pd

from .logger import logger
from .utils import enrich

ALLOWED_UNITS = [
    "minute",
    "hour",
    "day",
    "week",
    "month",
    "trimester",
    "semester",
    "year",
]

ESTIMABLE_CLASSIFICATION_VERSION = "old_version"
TIME_ESTIMATES_GENERATION_VERSION = "old_version"
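
# Illustrative only, not used by this module: downstream analysis has to collapse
# each (quantity, unit) pair onto a single scale before bounds can be compared.
# A minimal sketch, assuming 8 focused hours per day and 40 per week, with
# "trimester" read as 13 weeks and "semester" as 26 (all assumptions, not
# conversions defined anywhere in this commit):
HOURS_PER_UNIT = {
    "minute": 1 / 60,
    "hour": 1,
    "day": 8,
    "week": 40,
    "month": 40 * 52 / 12,
    "trimester": 40 * 13,
    "semester": 40 * 26,
    "year": 40 * 52,
}
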
def classify_tasks_as_estimable(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
    CACHE_PATH = cache_dir / f"task_estimability.{ESTIMABLE_CLASSIFICATION_VERSION}.parquet"
    if CACHE_PATH.exists() and not bust:
        logger.info(f"Loading cached task estimability from {CACHE_PATH}")
        return pd.read_parquet(CACHE_PATH)

    logger.info("Enriching tasks with estimability classification.")

    df_unique_tasks = df_to_process.drop_duplicates(subset=['task']).copy()

    logger.info(f"Found {len(df_unique_tasks)} unique remote tasks to classify.")

    if df_unique_tasks.empty:
        raise ValueError("No unique tasks to classify.")

    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=[
            [
                {"role": "system", "content": """
Classify the provided O*NET task into one of these categories:
- ATOMIC (schedulable): A single, clearly-bounded activity, typically lasting minutes, hours, or a few days.
- ONGOING-CONSTRAINT (background role/ethical rule): A continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”).
""".strip()},
                {"role": "user", "content": f"Task: {row.task}"},
            ]
            for row in df_unique_tasks.itertuples()
        ],
        schema={
            "name": "estimability_classification",
            "schema": {
                "type": "object",
                "properties": {"task_category": {"type": "string", "enum": ["ATOMIC", "ONGOING-CONSTRAINT"]}},
                "required": ["task_category"],
                "additionalProperties": False,
            },
        },
        chunk_size=300,
    )

    if not results or len(results) != len(df_unique_tasks):
        raise ValueError(
            f"Task estimability classification failed or returned a mismatched number of results. "
            f"Expected {len(df_unique_tasks)}, got {len(results) if results else 0}."
        )

    classifications = []
    for index, response in enumerate(results):
        task_label = df_unique_tasks.iloc[index]['task']
        task_category_flag = None

        if response is None:
            logger.warning(f"API call failed for task (enrich returned None): '{task_label}'")
        else:
            try:
                content_str = response.choices[0].message.content
                if not content_str:
                    raise ValueError("No content found in the response message")

                data = json.loads(content_str)

                if 'task_category' in data and isinstance(data['task_category'], str):
                    task_category_flag = data['task_category']
                else:
                    logger.warning(f"Invalid or missing 'task_category' payload for task '{task_label}'. Data: '{data}'")
            except (json.JSONDecodeError, AttributeError, KeyError, IndexError, ValueError) as e:
                logger.warning(f"Could not parse response for task '{task_label}'. Error: {e}. Response: {response}")

        classifications.append({
            'task': task_label,
            # Keep None (rather than False) when no category was obtained, so the
            # notna() count below reflects successful classifications only.
            'estimable': task_category_flag == 'ATOMIC' if task_category_flag is not None else None,
        })

    classification_df = pd.DataFrame(classifications)

    logger.info(f"Finished classification. Got {classification_df['estimable'].notna().sum()} successful classifications out of {len(df_unique_tasks)} unique tasks.")

    logger.info(f"Saving task estimability classifications to {CACHE_PATH}")
    classification_df.to_parquet(CACHE_PATH)

    return classification_df

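# For reference, hypothetical but schema-conforming payloads: a response that
# satisfies `estimability_classification` above decodes to, e.g.,
#   {"task_category": "ATOMIC"}
# and `estimate_time` below to, e.g.,
#   {"lower_bound_estimate": {"quantity": 30, "unit": "minute"},
#    "upper_bound_estimate": {"quantity": 2, "unit": "hour"}}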
def generate_time_estimates_for_tasks(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
    CACHE_PATH = cache_dir / f"task_estimates.{TIME_ESTIMATES_GENERATION_VERSION}.parquet"
    if CACHE_PATH.exists() and not bust:
        logger.info(f"Loading cached task estimates from {CACHE_PATH}")
        return pd.read_parquet(CACHE_PATH)

    logger.info("Enriching tasks with time estimates.")

    if df_to_process.empty:
        raise ValueError("No tasks to process for estimates.")

    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=[
            [
                {
                    "role": "system",
                    "content": """
You are an expert assistant evaluating the time required for job tasks. Your goal is to estimate the 'effective time' range needed for a skilled human to complete the following job task **remotely**, without supervision.

'Effective time' is the active, focused work duration required to complete the task. Crucially, **exclude all waiting periods, delays, or time spent on other unrelated activities**. Think of it as the continuous, productive time investment needed if the worker could pause and resume instantly without cost.

Provide a lower and upper bound estimate for the 'effective time'. These bounds should capture the time within which approximately 80% of instances of performing this specific task are typically completed by a qualified individual.

Base your estimate on the provided task and the associated occupation and occupation description. Your estimate must be in one of the allowed units: minute, hour, day, week, month, trimester, semester, year.""".strip(),
                },
                {
                    "role": "user",
                    "content": f"{row.task} done by {row.occupation_title} ({row.occupation_description})",
                },
            ]
            for row in df_to_process.itertuples()
        ],
        schema={
            "name": "estimate_time",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "lower_bound_estimate": {
                        "type": "object",
                        "properties": {
                            "quantity": {
                                "type": "number",
                                "description": "The numerical value for the lower bound of the estimate.",
                            },
                            "unit": {
                                "type": "string",
                                "enum": ALLOWED_UNITS,
                                "description": "The unit of time for the lower bound.",
                            },
                        },
                        "required": ["quantity", "unit"],
                        "additionalProperties": False,
                    },
                    "upper_bound_estimate": {
                        "type": "object",
                        "properties": {
                            "quantity": {
                                "type": "number",
                                "description": "The numerical value for the upper bound of the estimate.",
                            },
                            "unit": {
                                "type": "string",
                                "enum": ALLOWED_UNITS,
                                "description": "The unit of time for the upper bound.",
                            },
                        },
                        "required": ["quantity", "unit"],
                        "additionalProperties": False,
                    },
                },
                "required": ["lower_bound_estimate", "upper_bound_estimate"],
                "additionalProperties": False,
            },
        },
        chunk_size=200,
    )

    if not results or len(results) != len(df_to_process):
        raise ValueError(
            f"API call for task estimates failed or returned a mismatched number of results. "
            f"Expected {len(df_to_process)}, got {len(results) if results else 0}."
        )

    estimates = []
    for index, response in enumerate(results):
        row = df_to_process.iloc[index]
        task_info = f"O*NET: {row.onetsoc_code}, Task ID: {row.task_id}"
        lb_qty, lb_unit, ub_qty, ub_unit = None, None, None, None

        if response is None:
            logger.warning(f"API call failed for task (enrich returned None): {task_info}")
        else:
            try:
                content_str = response.choices[0].message.content
                if not content_str:
                    raise ValueError("No content found in the response message")

                data = json.loads(content_str)

                lb_qty = data['lower_bound_estimate']['quantity']
                lb_unit = data['lower_bound_estimate']['unit']
                ub_qty = data['upper_bound_estimate']['quantity']
                ub_unit = data['upper_bound_estimate']['unit']
            except Exception as e:
                logger.warning(f"Could not parse valid estimate for task {task_info}. Error: {e}. Response: {response}")
                lb_qty, lb_unit, ub_qty, ub_unit = None, None, None, None  # Reset on failure

        estimates.append({
            'onetsoc_code': row.onetsoc_code,
            'task_id': row.task_id,
            'lb_estimate_qty': lb_qty,
            'lb_estimate_unit': lb_unit,
            'ub_estimate_qty': ub_qty,
            'ub_estimate_unit': ub_unit,
        })

    estimates_df = pd.DataFrame(estimates)
    logger.info(f"Finished estimates. Got {estimates_df['lb_estimate_qty'].notna().sum()} successful estimates out of {len(df_to_process)} tasks.")

    logger.info(f"Saving task estimates to {CACHE_PATH}")
    estimates_df.to_parquet(CACHE_PATH)

    return estimates_df
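
if __name__ == "__main__":
    # Minimal end-to-end usage sketch, not invoked by the pipeline itself.
    # The file name and columns here are hypothetical: the frame is assumed to
    # carry 'task', 'task_id', 'onetsoc_code', 'occupation_title', and
    # 'occupation_description'.
    df = pd.read_parquet("tasks.parquet")
    cache = Path("cache")

    flags = classify_tasks_as_estimable(cache, df)
    df = df.merge(flags, on="task", how="left")

    # Only ATOMIC (schedulable) tasks get time estimates; failed
    # classifications are None and are filtered out by the equality check.
    estimates = generate_time_estimates_for_tasks(cache, df[df["estimable"] == True])
    print(estimates.head())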