wip
|  | @ -1,2 +1,3 @@ | |||
| - I use Nix. To run a command, prefix it with `nix develop .#impure -c` | ||||
| - I use uv. To add a package, use: uv add. To run a script use: uv run path/to/script | ||||
| - To run the pipeline: `uv run -m pipeline.runner` | ||||
|  |  | |||
							
								
								
									
										
											BIN
										
									
								
								dist/estimate_distribution_histplot.png
									
										
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						| After Width: | Height: | Size: 75 KiB | 
							
								
								
									
										
											BIN
										
									
								
								dist/estimates_lower_vs_upper_scatter.png
									
										
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						| After Width: | Height: | Size: 295 KiB | 
							
								
								
									
										
											BIN
										
									
								
								dist/estimates_spread_per_occupation.png
									
										
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						| After Width: | Height: | Size: 279 KiB | 
							
								
								
									
										
											BIN
										
									
								
								dist/intermediate/df_tasks.parquet
									
										
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
							
								
								
									
										
											BIN
										
									
								
								dist/intermediate/estimable_tasks_with_estimates.parquet
									
										
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
							
								
								
									
										
											BIN
										
									
								
								dist/intermediate/task_summary_by_major_occupation.parquet
									
										
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
							
								
								
									
										
											BIN
										
									
								
								dist/intermediate/task_summary_by_occupation.parquet
									
										
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
							
								
								
									
										
											BIN
										
									
								
								dist/projected_automatable_wage_bill_sensitivity.png
									
										
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						| After Width: | Height: | Size: 239 KiB | 
							
								
								
									
										
											BIN
										
									
								
								dist/projected_task_automation_p50.png
									
										
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						| After Width: | Height: | Size: 145 KiB | 
							
								
								
									
										
											BIN
										
									
								
								dist/projected_task_automation_p80.png
									
										
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						| After Width: | Height: | Size: 136 KiB | 
							
								
								
									
										
											BIN
										
									
								
								dist/sequential_coherence_cdf.png
									
										
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						| After Width: | Height: | Size: 145 KiB | 
|  | @ -1,507 +0,0 @@ | |||
| import pandas as pd | ||||
| import litellm | ||||
| import dotenv | ||||
| import os | ||||
| import time | ||||
| import json | ||||
| import math | ||||
| import numpy as np | ||||
| 
 | ||||
# --- Configuration ---
# The model must support json_schema / structured output.
MODEL = "gpt-4.1-mini"
# Requests per minute
RATE_LIMIT = 5000
# Number of message sets sent per batch call
CHUNK_SIZE = 300
SECONDS_PER_MINUTE = 60
# This CSV should contain the tasks to be processed
FILENAME = "tasks_with_estimates.csv"

# --- Prompts and Schema ---
SYSTEM_PROMPT = """
You are an expert assistant evaluating the time to completion required for job tasks. Your goal is to estimate the time range needed for a skilled human to complete the following job task remotely, without supervision.

Provide a lower and upper bound estimate for the time to completion time. These bounds should capture the time within which approximately 80% of instances of performing this specific task are typically completed by a qualified individual.

Base your estimate on the provided task description, its associated activities, and the occupational context. Your estimate must be in one the allowed units: minute, hour, day, week, month, trimester, semester, year.
""".strip()

USER_MESSAGE_TEMPLATE = """
Please estimate the time range for the following remote task:

**Task Description:** {task}
**Relevant activies for the task:**
{dwas}

**Occupation Category:** {occupation_title}
**Occupation Description:** {occupation_description}

Consider the complexity and the typical steps involved.
""".strip()

ALLOWED_UNITS = [
    "minute", "hour", "day", "week",
    "month", "trimester", "semester", "year",
]


def _bound_schema(which):
    """Return the JSON-schema object describing one bound ("lower" or "upper").

    Both bounds share the same shape: a numeric ``quantity`` and a ``unit``
    restricted to ALLOWED_UNITS; only the descriptions differ.
    """
    quantity_spec = {
        "type": "number",
        "description": f"The numerical value for the {which} bound of the estimate.",
    }
    unit_spec = {
        "type": "string",
        "enum": ALLOWED_UNITS,
        "description": f"The unit of time for the {which} bound.",
    }
    return {
        "type": "object",
        "properties": {"quantity": quantity_spec, "unit": unit_spec},
        "required": ["quantity", "unit"],
        "additionalProperties": False,
    }


SCHEMA_FOR_VALIDATION = {
    "name": "estimate_time",
    "strict": True,  # Enforce schema adherence
    "schema": {
        "type": "object",
        "properties": {
            "lower_bound_estimate": _bound_schema("lower"),
            "upper_bound_estimate": _bound_schema("upper"),
        },
        "required": ["lower_bound_estimate", "upper_bound_estimate"],
        "additionalProperties": False,
    },
}
| 
 | ||||
| 
 | ||||
def save_dataframe(df_to_save, filename):
    """Save the DataFrame to ``filename`` as CSV using an atomic write.

    The data is first written to ``<filename>.tmp`` and then moved over the
    target with ``os.replace``, so a crash mid-write never leaves a truncated
    CSV behind.

    Args:
        df_to_save: DataFrame to persist (written without its index,
            encoded as ``utf-8-sig``).
        filename: Destination path; a ``str`` or any ``os.PathLike``.

    Returns:
        None. Save failures are reported on stdout rather than raised
        (best-effort persistence, as before).
    """
    # Resolve the paths *before* the try block: the error handler below needs
    # ``temp_filename``, and building it inside the try could leave it unbound.
    target = os.fspath(filename)
    temp_filename = target + ".tmp"
    try:
        df_to_save.to_csv(temp_filename, encoding="utf-8-sig", index=False)
        os.replace(temp_filename, target)
    except Exception as e:
        print(f"--- Error saving DataFrame to {filename}: {e} ---")
        # Clean up a partially written temp file; it may legitimately not exist.
        try:
            os.remove(temp_filename)
        except FileNotFoundError:
            pass
        except OSError as remove_err:
            print(
                f"--- Error removing temporary save file {temp_filename}: {remove_err} ---"
            )
| 
 | ||||
def _ensure_estimate_columns(df):
    """Add or coerce the four estimate columns on ``df`` in place.

    Returns True when a column was added or had its dtype corrected, i.e.
    when the caller should persist the frame.
    """
    estimate_columns_spec = {
        "lb_estimate_qty": float,
        "lb_estimate_unit": object,
        "ub_estimate_qty": float,
        "ub_estimate_unit": object,
    }
    changed = False

    for col_name, target_dtype in estimate_columns_spec.items():
        wants_float = target_dtype is float

        if col_name not in df.columns:
            # Initialize with a type-compatible missing value, then enforce dtype.
            df[col_name] = np.nan if wants_float else pd.NA
            df[col_name] = df[col_name].astype(target_dtype)
            print(f"Added '{col_name}' column as {df[col_name].dtype}.")
            changed = True
        elif df[col_name].dtype != pd.Series(dtype=target_dtype).dtype:
            current_pd_dtype = df[col_name].dtype
            try:
                if wants_float:
                    df[col_name] = pd.to_numeric(df[col_name], errors="coerce")
                else:
                    df[col_name] = df[col_name].astype(object)
                print(f"Corrected dtype of '{col_name}' to {df[col_name].dtype}.")
                changed = True
            except Exception as e:
                print(
                    f"Warning: Could not convert column '{col_name}' to {target_dtype}: {e}. Current dtype: {current_pd_dtype}"
                )

        # Standardize missing values (empty strings / None -> NaN or pd.NA).
        # The result is assigned back to the frame: an in-place replace on the
        # selected column does not reliably update ``df`` (it is a no-op under
        # pandas Copy-on-Write).
        if wants_float:
            df[col_name] = pd.to_numeric(df[col_name], errors="coerce")
        else:
            column = df[col_name]
            df[col_name] = column.mask(column.isna() | column.isin([""]), pd.NA)

    return changed


def _build_messages(df_to_process):
    """Build one chat message set per row that has all required fields.

    Returns a tuple ``(messages_list, valid_indices, skipped_indices)`` where
    ``valid_indices[k]`` is the DataFrame index of the row that produced
    ``messages_list[k]``.
    """
    required_cols = ["task", "occupation_title", "occupation_description", "dwas"]
    messages_list = []
    valid_indices = []
    skipped_indices = []

    for index, row in df_to_process.iterrows():
        missing_or_empty = [
            col
            for col in required_cols
            if col not in row or pd.isna(row[col]) or str(row[col]).strip() == ""
        ]
        if missing_or_empty:
            print(
                f"Warning: Skipping row original index {index} due to missing/empty required data in columns: {', '.join(missing_or_empty)}."
            )
            skipped_indices.append(index)
            continue

        try:
            user_message = USER_MESSAGE_TEMPLATE.format(
                task=row["task"],
                occupation_title=row["occupation_title"],
                occupation_description=row["occupation_description"],
                dwas=row["dwas"],
            )
        except KeyError as e:
            # Only reachable if the template references a field not passed above.
            print(
                f"Error: Skipping row original index {index} due to formatting error - missing key: {e}. Check USER_MESSAGE_TEMPLATE and CSV columns."
            )
            skipped_indices.append(index)
            continue

        messages_list.append(
            [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_message},
            ]
        )
        valid_indices.append(index)

    return messages_list, valid_indices, skipped_indices


def _is_valid_quantity(value):
    """Return True for a real, finite, non-negative number (bools excluded)."""
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return False
    return math.isfinite(value) and value >= 0


def _parse_estimate(response, original_df_index):
    """Turn one model response into a dict of estimate column values.

    Returns ``None`` (after printing a warning) when the response is missing,
    empty, not valid JSON, or fails validation of quantities/units.
    """
    if response is None:
        print(
            f"Skipping processing for original index {original_df_index} due to API call failure for this item (response is None)."
        )
        return None

    content_str = None
    try:
        choices = response.choices
        message = choices[0].message if choices else None
        content_str = message.content if message else None

        if not content_str:
            finish_reason = (
                choices[0].finish_reason
                if (choices and choices[0].finish_reason)
                else "unknown"
            )
            print(
                f"Warning: Received non-standard or empty response content for original index {original_df_index}. "
                f"Finish Reason: '{finish_reason}'. Message: 'No content in message.'. Raw Choices: {choices}"
            )
            return None

        estimate_data = json.loads(content_str)  # Can raise JSONDecodeError
        bounds = {}
        if isinstance(estimate_data, dict):
            bounds = {
                "lb": estimate_data.get("lower_bound_estimate"),
                "ub": estimate_data.get("upper_bound_estimate"),
            }
        if not bounds or not all(isinstance(b, dict) for b in bounds.values()):
            print(
                f"Warning: Missing or malformed estimate dicts in JSON for original index {original_df_index}. Content: '{content_str}'"
            )
            return None

        estimate = {}
        is_valid_item = True
        for prefix, bound in bounds.items():
            quantity = bound.get("quantity")
            unit = bound.get("unit")

            if _is_valid_quantity(quantity):
                estimate[f"{prefix}_estimate_qty"] = float(quantity)
            else:
                print(
                    f"Warning: Invalid {prefix}_quantity for original index {original_df_index}: {quantity}"
                )
                is_valid_item = False

            if unit in ALLOWED_UNITS:
                estimate[f"{prefix}_estimate_unit"] = unit
            else:
                print(
                    f"Warning: Invalid {prefix}_unit for original index {original_df_index}: '{unit}'"
                )
                is_valid_item = False

        # Partially valid items are discarded so the row is retried next run.
        return estimate if is_valid_item else None

    except json.JSONDecodeError:
        print(
            f"Warning: Could not decode JSON for original index {original_df_index}. Content received: '{content_str}'"
        )
    except AttributeError as ae:
        # Also covers per-item failures returned as non-response objects.
        print(
            f"Warning: Missing expected attribute processing response for original index {original_df_index}: {ae}. Response: {response}"
        )
    except Exception as e:
        print(
            f"Warning: An unexpected error occurred processing response for original index {original_df_index}: {type(e).__name__} - {e}. Response: {response}"
        )
    return None


def create_task_estimates():
    """Fill in missing time estimates in FILENAME by querying the model in batches.

    Workflow:
      1. Read FILENAME and make sure the four estimate columns exist.
      2. Select rows whose ``lb_estimate_qty`` is missing (this makes the run
         resumable: failed or skipped rows stay missing and are retried).
      3. Send the rows to ``litellm.batch_completion`` in chunks of CHUNK_SIZE,
         validate each structured response, and save after every chunk.
      4. Sleep between chunks when needed to stay under RATE_LIMIT.

    Raises:
        SystemExit: with status 1 when FILENAME is missing or cannot be
            read/initialized; with status 0 when there is nothing to process.
    """
    missing_file_msg = (
        f"Error: {FILENAME} not found. Please ensure the file exists and contains task data."
    )
    if not os.path.exists(FILENAME):
        print(missing_file_msg)
        raise SystemExit(1)

    try:
        df = pd.read_csv(FILENAME, encoding="utf-8-sig")
        print(f"Successfully read {len(df)} rows from {FILENAME}.")
        if _ensure_estimate_columns(df):
            print(f"Saving {FILENAME} after adding/adjusting estimate columns.")
            save_dataframe(df, FILENAME)
    except FileNotFoundError:
        # The file vanished between the existence check and the read.
        print(missing_file_msg)
        raise SystemExit(1)
    except Exception as e:
        print(f"Error reading or initializing {FILENAME}: {e}")
        raise SystemExit(1)

    # --- Identify Rows to Process ---
    # A missing lower-bound quantity marks a row as not yet estimated.
    unprocessed_mask = df["lb_estimate_qty"].isna()
    if not unprocessed_mask.any():
        print(
            "All rows seem to have estimates already (based on 'lb_estimate_qty'). Exiting."
        )
        raise SystemExit(0)

    df_to_process = df.loc[unprocessed_mask]
    first_index = df_to_process.index[0]
    print(f"Resuming processing. First unprocessed row found at index {first_index}.")

    # --- Prepare messages for batch completion ---
    print(
        f"Preparing messages for up to {len(df_to_process)} rows starting from original index {first_index}..."
    )
    messages_list, valid_original_indices, skipped_rows_indices = _build_messages(
        df_to_process
    )
    print(
        f"Prepared {len(messages_list)} valid message sets for batch completion (skipped {len(skipped_rows_indices)} rows)."
    )
    if not messages_list:
        print("No valid rows found to process after checking required data. Exiting.")
        raise SystemExit(0)

    # --- Call batch_completion in chunks with rate limiting and periodic saving ---
    total_messages_to_send = len(messages_list)
    num_chunks = math.ceil(total_messages_to_send / CHUNK_SIZE)
    # Ideal time per request implied by the rate limit (loop-invariant).
    time_per_request = SECONDS_PER_MINUTE / RATE_LIMIT if RATE_LIMIT > 0 else 0

    print(
        f"\nStarting batch completion for {total_messages_to_send} items in {num_chunks} chunks..."
    )

    overall_start_time = time.time()
    processed_count_total = 0

    for i in range(num_chunks):
        chunk_start = i * CHUNK_SIZE
        chunk_end = min(chunk_start + CHUNK_SIZE, total_messages_to_send)
        message_chunk = messages_list[chunk_start:chunk_end]
        # DataFrame indices of the rows behind this chunk, in message order.
        chunk_original_indices = valid_original_indices[chunk_start:chunk_end]

        print(
            f"\nProcessing chunk {i + 1}/{num_chunks} (Messages {chunk_start + 1}-{chunk_end} of this run)..."
            f" Corresponding to original indices: {min(chunk_original_indices)} - {max(chunk_original_indices)}"
        )
        chunk_start_time = time.time()

        try:
            print(f"Sending {len(message_chunk)} requests for chunk {i + 1}...")
            responses = litellm.batch_completion(
                model=MODEL,
                messages=message_chunk,
                response_format={
                    "type": "json_schema",
                    "json_schema": SCHEMA_FOR_VALIDATION,
                },
                num_retries=3,
            )
            print(f"Chunk {i + 1} API call completed.")
        except Exception as e:
            print(f"Error during litellm.batch_completion for chunk {i + 1}: {e}")
            # Keep one placeholder per message so every row is reported as failed.
            responses = [None] * len(message_chunk)

        # --- Process responses for the current chunk ---
        chunk_updates = {}  # {original_df_index: {column: value}}
        failed_in_chunk = 0

        if responses and len(responses) == len(message_chunk):
            for original_df_index, response in zip(chunk_original_indices, responses):
                estimate = _parse_estimate(response, original_df_index)
                if estimate is None:
                    failed_in_chunk += 1
                else:
                    chunk_updates[original_df_index] = estimate
        else:
            print(
                f"Warning: Mismatch between number of responses ({len(responses) if responses else 0}) "
                f"and messages sent ({len(message_chunk)}) for chunk {i + 1}, or no responses. Marking all as failed."
            )
            failed_in_chunk = len(message_chunk)

        successful_in_chunk = len(chunk_updates)
        print(
            f"Chunk {i + 1} processing summary: Success={successful_in_chunk}, Failed/Skipped={failed_in_chunk}"
        )
        processed_count_total += successful_in_chunk

        # --- Update Main DataFrame and Save Periodically ---
        if chunk_updates:
            print(
                f"Updating main DataFrame with {len(chunk_updates)} new estimates for chunk {i + 1}..."
            )
            for idx, estimates in chunk_updates.items():
                for column, value in estimates.items():
                    df.loc[idx, column] = value

            print(f"Saving progress to {FILENAME}...")
            save_dataframe(df, FILENAME)
        else:
            print(f"No successful estimates obtained in chunk {i + 1} to save.")

        # --- Rate Limiting Pause ---
        chunk_duration = time.time() - chunk_start_time
        print(f"Chunk {i + 1} took {chunk_duration:.2f} seconds.")

        if i < num_chunks - 1:  # No pause after the last chunk
            # Minimum duration this chunk should have taken to respect the limit.
            min_chunk_duration_for_rate = len(message_chunk) * time_per_request
            pause_needed = max(0, min_chunk_duration_for_rate - chunk_duration)
            if pause_needed > 0:
                print(
                    f"Pausing for {pause_needed:.2f} seconds to respect rate limit ({RATE_LIMIT}/min)..."
                )
                time.sleep(pause_needed)

    total_duration_minutes = (time.time() - overall_start_time) / 60
    print(
        f"\nBatch completion finished."
        f" Processed {processed_count_total} new estimates in this run in {total_duration_minutes:.2f} minutes."
    )

    print(f"Performing final save to {FILENAME}...")
    save_dataframe(df, FILENAME)

    print("\nScript finished.")
							
								
								
									
										528
									
								
								old/analysis.py
									
										
									
									
									
								
							
							
						
						|  | @ -1,528 +0,0 @@ | |||
| import os | ||||
| import litellm | ||||
| import sqlite3 | ||||
| import numpy as np | ||||
| import pandas as pd | ||||
| from google.colab import userdata, files | ||||
| import seaborn as sns | ||||
| import matplotlib.pyplot as plt | ||||
| import matplotlib as mpl | ||||
| 
 | ||||
| os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY') | ||||
| os.environ['GEMINI_API_KEY'] = userdata.get('GEMINI_API_KEY') | ||||
| 
 | ||||
# Two-character major-group prefix of an occupation code -> group title.
occupation_major_codes = dict([
    ('11', 'Management'),
    ('13', 'Business and Financial Operations'),
    ('15', 'Computer and Mathematical Occupations'),
    ('17', 'Architecture and Engineering'),
    ('19', 'Life, Physical, and Social Science'),
    ('21', 'Community and Social Services'),
    ('23', 'Legal'),
    ('25', 'Education, Training, and Library'),
    ('27', 'Arts, Design, Entertainment, Sports, and Media'),
    ('29', 'Healthcare Practitioners and Technical'),
    ('31', 'Healthcare Support'),
    ('33', 'Protective Service'),
    ('35', 'Food Preparation and Serving Related'),
    ('37', 'Building and Grounds Cleaning and Maintenance'),
    ('39', 'Personal Care and Service'),
    ('41', 'Sales and Related'),
    ('43', 'Office and Administrative Support'),
    ('45', 'Farming, Fishing, and Forestry'),
    ('47', 'Construction and Extraction'),
    ('49', 'Installation, Maintenance, and Repair'),
    ('51', 'Production'),
    ('53', 'Transportation and Material Moving'),
    ('55', 'Military Specific'),
])

# Colour palettes keyed by shade name, lightest ('50') to darkest ('950').
_SHADES = ('50', '100', '200', '300', '400', '500',
           '600', '700', '800', '900', '950')
gray = dict(zip(_SHADES, (
    '#f8fafc', '#f1f5f9', '#e2e8f0', '#cbd5e1', '#94a3b8', '#64748b',
    '#475569', '#334155', '#1e293b', '#0f172a', '#020617',
)))
lime = dict(zip(_SHADES, (
    '#f7fee7', '#ecfcca', '#d8f999', '#bbf451', '#9ae600', '#83cd00',
    '#64a400', '#497d00', '#3c6300', '#35530e', '#192e03',
)))
| 
 | ||||
# Apply the seaborn presets FIRST. sns.set_style() and sns.set_context() write
# their own axes/tick colours, font family and font size into rcParams, so
# calling them after the update below would silently discard the custom theme.
sns.set_style("white")         # keep minimal axes, we will remove default grid
sns.set_context("notebook")

# Project theme, layered on top of the seaborn presets.
mpl.rcParams.update({
    'figure.facecolor' : gray['50'],
    'axes.facecolor'   : gray['50'],
    'axes.edgecolor'   : gray['100'],
    'axes.labelcolor'  : gray['700'],
    'xtick.color'      : gray['700'],
    'ytick.color'      : gray['700'],
    'font.family'      : 'Inter',  # falls back to DejaVu if Inter not present
    'font.size'        : 11,
})
| 
 | ||||
def prepare_tasks():
    """Load the raw input files and build the task frames the cells read.

    Files read from the working directory:
      * ``epoch_task_data.csv``       - per-task remote labels
      * ``oesm23national.xlsx``       - BLS OEWS national table
      * ``task_ratings_enriched.json``- tasks with ratings
      * ``tasks_estimateable.csv``    - per-task estimateability class

    The resulting frames are published as module globals (``df_oesm``,
    ``df_task_estimateable``, ``df_tasks``, ``df_remote_tasks``) because the
    other functions in this file read them by bare name; as locals they were
    discarded on return and every consumer failed with NameError.

    Returns:
        None.
    """
    global df_oesm, df_task_estimateable, df_tasks, df_remote_tasks

    # This dataset comes from https://epoch.ai/gradient-updates/consequences-of-automating-remote-work
    # It labels whether an O*NET task can be done remotely or not (labeled by GPT-4o).
    # You can download it here: https://drive.google.com/file/d/1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r/view?usp=sharing
    df_remote_status = pd.read_csv("epoch_task_data.csv")

    # BLS OEWS: https://www.bls.gov/oes/special-requests/oesm23nat.zip
    df_oesm = pd.read_excel("oesm23national.xlsx")

    # Produced by: uv run ./enrich_task_ratings.py
    df_tasks = pd.read_json("task_ratings_enriched.json")

    # Produced by: uv run classify_estimateability_of_tasks.py
    # One row per distinct task text; the first classification wins.
    df_task_estimateable = (
        pd.read_csv("tasks_estimateable.csv")
        .rename(columns={"task_estimateable": "estimateable"})
        .drop_duplicates(subset=['task'], keep='first')
    )

    # Attach the remote label as `remote_status`. Left join: tasks missing from
    # the remote dataset keep a NaN label.
    df_tasks = pd.merge(df_tasks, df_remote_status[['Task', 'Remote']], left_on='task', right_on='Task', how='left')
    df_tasks = df_tasks.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'})

    # Attach the `estimateable` class ("ATOMIC" or "ONGOING-CONSTRAINT").
    df_tasks = pd.merge(df_tasks, df_task_estimateable[['task', 'estimateable']], on='task', how='left')

    # NOTE(review): this keeps tasks with importance BELOW 3, whereas
    # preprocessing_time_estimates() keeps importance ABOVE 3 - confirm which
    # direction is intended before relying on either frame.
    df_tasks = df_tasks[df_tasks['importance_average'] < 3].copy()

    # Major occupation group = first two characters of the occupation code.
    df_tasks['onetsoc_major'] = df_tasks['onetsoc_code'].str[:2]

    df_remote_tasks = df_tasks[df_tasks['remote_status'] == 'remote'].copy()

    # Next step: call create_task_estimates() from add_task_estimates, which creates tasks_with_estimates.csv
| 
 | ||||
def preprocessing_time_estimates():
    """Load the task time estimates, normalise them to minutes and sanity-check them.

    Reads ``tasks_with_estimates.csv``, ``tasks_with_embeddings.parquet`` and
    ``tasks_estimateable.csv`` from the working directory, keeps tasks with
    ``importance_average`` above 3, converts the lower/upper estimates to
    minutes and derives ``estimate_range``, ``estimate_ratio`` and
    ``estimate_midpoint``.

    The ATOMIC and ONGOING-CONSTRAINT subsets are published as the module
    globals ``atomic_tasks`` and ``ongoing_tasks`` because the cell functions
    read them by bare name.

    Relies on the notebook-provided ``display`` helper being available.

    Returns:
        None.
    """
    global atomic_tasks, ongoing_tasks

    df = pd.read_csv("tasks_with_estimates.csv")
    df = df[df['importance_average'] > 3].copy()

    # The embeddings come from running `uv run ./embed_task_description.py`.
    # One embedding per UNIQUE task text.
    df_task_embeddings = (
        pd.read_parquet("tasks_with_embeddings.parquet")
        .drop_duplicates(subset=['task'])[['task', 'task_embedding']]
        .rename(columns={"task_embedding": "embedding_vector"})
        .copy()
    )

    # Loaded here (same recipe as prepare_tasks) so this function does not
    # depend on a frame that only exists inside another function.
    df_task_estimateable = (
        pd.read_csv("tasks_estimateable.csv")
        .rename(columns={"task_estimateable": "estimateable"})
        .drop_duplicates(subset=['task'], keep='first')
    )

    df = pd.merge(df, df_task_embeddings[['task', 'embedding_vector']], on='task', how='left')
    df = pd.merge(df, df_task_estimateable[['task', 'estimateable']], on='task', how='left')

    df['onetsoc_major'] = df['onetsoc_code'].str[:2]

    # Calendar-time conversion factors (a day is 24 h, a month 30 days, ...).
    minutes_per_unit = {
        "minute": 1,
        "hour": 60,
        "day": 60 * 24,
        "week": 60 * 24 * 7,
        "month": 60 * 24 * 30,
        "trimester": 60 * 24 * 90,
        "semester": 60 * 24 * 180,
        "year": 60 * 24 * 365,
    }

    def convert_to_minutes(qty, unit):
        """Converts a quantity in a given unit to minutes."""
        return qty * minutes_per_unit[unit]

    df['lb_estimate_in_minutes'] = df.apply(
        lambda row: convert_to_minutes(row['lb_estimate_qty'], row['lb_estimate_unit']), axis=1
    )
    df['ub_estimate_in_minutes'] = df.apply(
        lambda row: convert_to_minutes(row['ub_estimate_qty'], row['ub_estimate_unit']), axis=1
    )

    df['estimate_range'] = df.ub_estimate_in_minutes - df.lb_estimate_in_minutes
    # A zero lower bound yields inf here; consumers filter non-finite ratios.
    df['estimate_ratio'] = df.ub_estimate_in_minutes / df.lb_estimate_in_minutes
    df['estimate_midpoint'] = (df.lb_estimate_in_minutes + df.ub_estimate_in_minutes) / 2

    # Copies, so that cells adding columns do not write into a slice of `df`.
    atomic_tasks = df[df['estimateable'] == 'ATOMIC'].copy()
    ongoing_tasks = df[df['estimateable'] == 'ONGOING-CONSTRAINT'].copy()

    with pd.option_context('display.max_columns', None):
        display(df)

    # Check for empty estimates
    for bound_column in ('lb_estimate_in_minutes', 'ub_estimate_in_minutes'):
        missing = atomic_tasks[bound_column].isnull().sum()
        if missing > 0:
            print(f"Missing values in '{bound_column}':", missing)

    # Check for impossible bounds: non-positive, or lower above upper.
    impossible_bounds = atomic_tasks[
        (atomic_tasks['lb_estimate_in_minutes'] <= 0) |
        (atomic_tasks['ub_estimate_in_minutes'] <= 0) |
        (atomic_tasks['lb_estimate_in_minutes'] > atomic_tasks['ub_estimate_in_minutes'])
    ]
    if not impossible_bounds.empty:
        print("Error: Found rows with impossible bounds.")
        with pd.option_context('display.max_colwidth', None):
            display(impossible_bounds[['task', 'lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'dwas']])
| 
 | ||||
def cell1():
    """Histogram of the ATOMIC tasks' midpoint estimates on a log-scaled axis."""
    midpoints = atomic_tasks['estimate_midpoint']
    sns.histplot(midpoints, log_scale=True)
| 
 | ||||
def cell2():
    """Box plot of the estimate range (upper - lower) per major occupation group."""
    plt.figure(figsize=(14, 10))
    axis = sns.boxplot(
        data=atomic_tasks,
        x='onetsoc_major',           # 11 = Management, 15 = Computer/Math, ...
        y='estimate_range',
        showfliers=False,
    )

    # Long-tailed distribution, hence the log scale.
    axis.set_yscale('log')
    axis.set_xlabel('Occupation')
    axis.set_ylabel('Range (upper-lower, minutes)')
    axis.set_title('Spread of time-range estimates per occupation')

    # Swap the two-digit codes on the x axis for the occupation titles.
    titles = []
    for tick_label in axis.get_xticklabels():
        titles.append(occupation_major_codes[tick_label.get_text()])
    axis.set_xticklabels(titles, rotation=60, ha='right')
| 
 | ||||
def cell3():
    """Log-log scatter of lower vs upper estimate, coloured by occupation group."""
    plt.figure(figsize=(10, 10))

    # Replace the major codes with their titles so the legend is readable.
    labelled_tasks = atomic_tasks.replace({'onetsoc_major': occupation_major_codes})
    ax = sns.scatterplot(
        data=labelled_tasks,
        x='lb_estimate_in_minutes',
        y='ub_estimate_in_minutes',
        hue="onetsoc_major",
        alpha=0.2,
        edgecolor=None,
    )

    # 45-degree reference line (upper == lower).
    bound_columns = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
    largest_estimate = atomic_tasks[bound_columns].max().max()
    lims = (1, largest_estimate)
    ax.plot(lims, lims, color='black', linestyle='--', linewidth=1)

    # Helper lines where the upper bound is 2x, 10x and 100x the lower bound.
    for factor in (2, 10, 100):
        scaled = [factor * endpoint for endpoint in lims]
        ax.plot(lims, scaled, linestyle=':', color='grey', linewidth=1)

    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlabel('Lower-bound (min, log scale)')
    ax.set_ylabel('Upper-bound (min, log scale)')
    ax.set_title('Lower vs upper estimates for all tasks')

    # Place the legend outside the plot area.
    ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
| 
 | ||||
def cell4():
    """Histogram (with KDE) of log10(upper / lower) for the ATOMIC tasks."""
    plt.figure(figsize=(8, 4))

    # Infinite ratios (zero lower bound) and missing ratios are left out.
    finite_ratio = (
        atomic_tasks['estimate_ratio']
        .replace([np.inf, -np.inf], np.nan)
        .dropna()
    )
    sns.histplot(np.log10(finite_ratio), bins=60, kde=True)

    # Reference markers at 10x and 1.05x, plus a solid line where ub == lb.
    plt.axvline(np.log10(10), color='red', ls='--', lw=1, label='10×')
    plt.axvline(np.log10(1.05), color='orange', ls='--', lw=1, label='1.05×')
    plt.axvline(0, color='black', ls='-', lw=1)

    plt.xlabel('log₁₀(upper / lower)')
    plt.ylabel('Count')
    plt.title('Distribution of upper:lower ratio')
    plt.legend()
    plt.tight_layout()
| 
 | ||||
| 
 | ||||
def cell5():
    """Heat map of the median upper/lower ratio by occupation and lower-bound quartile.

    Side effect: writes an ``lb_q`` column onto the shared ``atomic_tasks`` frame.
    """
    # Bin the lower bounds into four quantile buckets.
    quartile_names = ['Q1 shortest', 'Q2', 'Q3', 'Q4 longest']
    atomic_tasks['lb_q'] = pd.qcut(
        atomic_tasks['lb_estimate_in_minutes'], q=4, labels=quartile_names
    )

    # Median ratio for every (occupation, quartile) cell.
    median_ratio = atomic_tasks.pivot_table(
        values='estimate_ratio',
        index='onetsoc_major',
        columns='lb_q',
        aggfunc='median',
    )

    # Show occupation titles rather than the two-digit codes.
    median_ratio.index = median_ratio.index.map(occupation_major_codes)

    plt.figure(figsize=(10, 8))
    sns.heatmap(
        median_ratio,
        cmap='RdYlGn_r',
        center=2,
        annot=True,
        fmt='.1f',
        cbar_kws={'label': 'Median upper/lower ratio'},
    )
    plt.xlabel('Lower-bound quartile')
    plt.ylabel('Occupation (major group)')
    plt.title('Typical range width by occupation and task length')
    plt.tight_layout()
| 
 | ||||
| 
 | ||||
| 
 | ||||
def cell6():
    """Return the ATOMIC tasks whose midpoint estimate is an outlier in its occupation.

    For every ``onetsoc_code`` the mean and sample standard deviation of
    ``estimate_midpoint`` are computed and merged back onto the task rows; a
    row is an outlier when its z-score ``(midpoint - mean) / std`` exceeds 3
    in absolute value. Occupations with a single task have an undefined std,
    so their rows get a NaN z-score and are never reported.

    The module-level ``atomic_tasks`` frame is only read: the scoring happens
    on a local merged copy. (Rebinding ``atomic_tasks`` inside the function
    made the name local and raised UnboundLocalError on its first use.)

    Returns:
        pandas.DataFrame: the outlier rows, including the ``mean``, ``std``
        and ``z`` columns.
    """
    per_occupation = (
        atomic_tasks
        .groupby('onetsoc_code')['estimate_midpoint']
        .agg(mean='mean', std='std')
        .reset_index()
    )

    # Merge the group statistics back so each row can be scored.
    scored = atomic_tasks.merge(
        per_occupation[['onetsoc_code', 'mean', 'std']], on='onetsoc_code'
    )
    scored['z'] = (scored['estimate_midpoint'] - scored['mean']) / scored['std']

    return scored.loc[scored['z'].abs() > 3]
| 
 | ||||
def cell7():
    """Add a ``robust_z`` column to ``atomic_tasks``.

    Within each ``onetsoc_code`` group the midpoint estimate is centred on the
    group median and divided by the group's normal-scaled median absolute
    deviation.
    """
    from scipy.stats import median_abs_deviation

    def robust_score(group_values):
        centred = group_values - group_values.median()
        # scale='normal' makes the MAD comparable to a standard deviation.
        spread = median_abs_deviation(group_values, scale='normal')
        return centred / spread

    midpoints_by_occupation = atomic_tasks.groupby('onetsoc_code')['estimate_midpoint']
    atomic_tasks['robust_z'] = midpoints_by_occupation.transform(robust_score)
| 
 | ||||
def cell10():
    """Stacked horizontal bar chart of task categories per major occupation group.

    For each major group found in ``df_tasks`` the chart shows the percentage
    of tasks that are not remote, remote and ATOMIC, and remote and
    ONGOING-CONSTRAINT. Segments wider than 3 % are annotated with their value.
    """
    import matplotlib.ticker as mtick  # percentage axis formatting
    import matplotlib.colors as mcolors  # colour conversion

    def get_contrasting_text_color(bg_color_hex_or_rgba):
        """Return 'black' or 'white', whichever reads better on the background.

        Accepts a colour string (e.g. '#RRGGBB') or an RGBA tuple with
        components in [0, 1].
        """
        if isinstance(bg_color_hex_or_rgba, str):
            rgba = mcolors.to_rgba(bg_color_hex_or_rgba)
        else:
            rgba = bg_color_hex_or_rgba
        red, green, blue, _alpha = rgba  # alpha plays no part in luminance
        luminance = 0.2126 * red + 0.7152 * green + 0.0722 * blue
        if luminance > 0.55:
            return 'black'
        return 'white'

    # --- Per-occupation counts ---
    rows = []
    for major_code, major_label in occupation_major_codes.items():
        in_group = df_tasks[df_tasks['onetsoc_major'] == major_code]
        n_tasks = len(in_group)
        if n_tasks == 0:
            continue  # nothing to report for this occupation

        is_remote = in_group['remote_status'] == 'remote'
        remote_part = in_group[is_remote]
        rows.append({
            'onetsoc_major_code': major_code,
            'occupation_label': major_label,
            # Everything whose label is not exactly "remote" (missing included).
            'count_not_remote': len(in_group[~is_remote]),
            'count_remote_atomic': len(remote_part[remote_part['estimateable'] == 'ATOMIC']),
            'count_remote_ongoing': len(remote_part[remote_part['estimateable'] == 'ONGOING-CONSTRAINT']),
            'total_tasks': n_tasks,
        })

    summary = pd.DataFrame(rows)

    # --- Percentages ---
    # Guard against division by zero; .copy() avoids SettingWithCopyWarning.
    summary = summary[summary['total_tasks'] > 0].copy()
    for suffix in ('not_remote', 'remote_atomic', 'remote_ongoing'):
        summary[f'pct_{suffix}'] = (summary[f'count_{suffix}'] / summary['total_tasks']) * 100

    # One row per occupation title, one column per stack, with legend-friendly names.
    shares = summary.set_index('occupation_label')[
        ['pct_not_remote', 'pct_remote_atomic', 'pct_remote_ongoing']
    ].rename(columns={
        'pct_not_remote': 'Not Remote',
        'pct_remote_atomic': 'Remote + Estimable',
        'pct_remote_ongoing': 'Remote + Not estimable',
    })
    shares = shares.sort_values(by='Not Remote', ascending=False)

    # --- Plotting ---
    # Colour order follows the column order of `shares`.
    bar_colors = [gray["300"], lime["500"], lime["200"]]

    fig, ax = plt.subplots(figsize=(14, 10))
    shares.plot(kind='barh', stacked=True, ax=ax, color=bar_colors)

    ax.set_xlabel("Percentage of Tasks (%)", fontsize=12)
    ax.set_ylabel("Occupation Major Group", fontsize=12)
    ax.set_title("Task Breakdown by Occupation, Remote Status, and Estimateability", fontsize=14, pad=20)

    # Percent-formatted x axis spanning 0-100 %.
    ax.xaxis.set_major_formatter(mtick.PercentFormatter())
    ax.set_xlim(0, 100)

    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)

    # Write each segment's percentage inside it, one container per category.
    for index, bars in enumerate(ax.containers):
        label_color = get_contrasting_text_color(bar_colors[index])
        for segment in bars.patches:
            share = segment.get_width()
            if share <= 3:
                continue  # too narrow to hold a readable label
            ax.text(
                segment.get_x() + share / 2,
                segment.get_y() + segment.get_height() / 2,
                f"{share:.1f}%",
                ha='center',
                va='center',
                fontsize=8,
                color=label_color,
                fontweight='medium',
            )

    ax.legend(title="Task Category", bbox_to_anchor=(1.02, 1), loc='upper left', frameon=False)
| 
 | ||||
def cell11():
    """Bar chart of the total wage bill per major occupation group.

    Wage bill = total employment (``TOT_EMP``) x annual mean wage (``A_MEAN``),
    summed over the rows that share a two-character ``OCC_CODE`` prefix and
    plotted in billions, matching the axis label (the raw product was
    previously plotted under a "billions" label).

    Side effect: mutates the module-level ``df_oesm`` in place (adds columns,
    coerces the two numeric columns, drops rows lacking either value).

    NOTE(review): a second ``def cell11`` later in this file rebinds the name,
    so this function is unreachable unless one of the two is renamed.
    NOTE(review): if ``df_oesm`` holds aggregate rows next to the detailed
    occupations, the sum below counts the same employment more than once -
    confirm against the input file and filter to detailed rows if so.
    """
    df_oesm['onetsoc_major'] = df_oesm['OCC_CODE'].str[:2]

    # Non-numeric cells become NaN so they can be dropped below.
    df_oesm['TOT_EMP'] = pd.to_numeric(df_oesm['TOT_EMP'], errors='coerce')
    df_oesm['A_MEAN'] = pd.to_numeric(df_oesm['A_MEAN'], errors='coerce')
    df_oesm.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_major'], inplace=True)

    df_oesm['wage_bill'] = df_oesm['TOT_EMP'] * df_oesm['A_MEAN']

    # Aggregate per major group and attach readable titles.
    df_wage_bill_major = df_oesm.groupby('onetsoc_major')['wage_bill'].sum().reset_index()
    df_wage_bill_major['OCC_TITLE_MAJOR'] = df_wage_bill_major['onetsoc_major'].map(occupation_major_codes)

    # Express the plotted quantity in the unit the axis label promises.
    df_wage_bill_major['wage_bill_billions'] = df_wage_bill_major['wage_bill'] / 1e9

    # Largest wage bill first.
    df_wage_bill_major = df_wage_bill_major.sort_values('wage_bill', ascending=False)

    plt.figure(figsize=(12, 8))
    sns.barplot(x='wage_bill_billions', y='OCC_TITLE_MAJOR', data=df_wage_bill_major, palette="viridis")
    plt.title('Total Wage Bill per Major Occupation Group')
    plt.xlabel('Total Wage Bill (in billions)')
    plt.ylabel('Major Occupation Group')
    plt.grid(axis='x', linestyle='--', alpha=0.7)
| 
 | ||||
def cell11():
    """Plot the empirical CDFs of the lower, upper and midpoint estimates.

    The x axis is log-scaled minutes with ticks at human-friendly durations;
    the y axis is the percentage of ATOMIC tasks at or below each value.
    """
    def cdf(series):
        # Sorted values paired with the cumulative share (in percent).
        ordered = series.sort_values().reset_index(drop=True)
        percent_at_or_below = ((ordered.index + 1) / len(ordered)) * 100
        return ordered.values, percent_at_or_below

    lower = atomic_tasks['lb_estimate_in_minutes']
    upper = atomic_tasks['ub_estimate_in_minutes']

    # (curve, line style) in drawing order; the midpoint goes last, on top.
    curves = [
        (cdf(lower), dict(color=lime['300'], linewidth=1.8, linestyle='--',
                          zorder=2, label='Lower bound estimate (CDF)')),
        (cdf(upper), dict(color=lime['900'], linewidth=1.8, linestyle=':',
                          zorder=3, label='Upper bound estimate (CDF)')),
        (cdf((upper + lower) / 2), dict(color=lime['600'], linewidth=2.2,
                                        zorder=4, label='Mid-point estimate (CDF)')),
    ]

    fig, ax = plt.subplots(figsize=(10, 6))

    # Horizontal reference lines every 10 %.
    for level in range(0, 101, 10):
        ax.axhline(level, color=gray['100'], linewidth=.8, zorder=1)

    for (xs, ys), line_style in curves:
        ax.step(xs, ys, where='post', **line_style)

    # Axis limits and scales.
    ax.set_ylim(0, 100)
    ax.set_xscale('log')
    ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(decimals=0))

    # The y label sits above the top-left corner of the axes.
    ax.text(-0.06, 1.03,
            "% of tasks with temporal coherence ≤ X",
            ha='left', va='bottom',
            transform=ax.transAxes,
            fontsize=12, fontweight='semibold')

    # Tick positions (minutes) with their human-friendly labels.
    duration_ticks = [
        (1, '1 min'), (5, '5 min'), (10, '10 min'), (30, '30 min'),
        (60, '1 hour'), (120, '2 hours'), (240, '4 hours'), (480, '8 hours'),
        (1440, '1 day'), (2880, '2 days'), (10080, '1 week'), (43200, '30 days'),
        (129600, '90 days'), (259200, '180 days'), (525600, '1 year'),
    ]
    ticks = [minutes for minutes, _ in duration_ticks]
    ticklabels = [text for _, text in duration_ticks]

    # Vertical reference line at every tick.
    for position in ticks:
        ax.axvline(position, color=gray['300'], linewidth=.8, linestyle='--', zorder=1)

    ax.set_xticks(ticks)
    ax.set_xticklabels(ticklabels, rotation=45, ha='right')

    for hidden in ('top', 'right'):
        ax.spines[hidden].set_visible(False)
    for kept in ('left', 'bottom'):
        ax.spines[kept].set_edgecolor(gray['300'])

    ax.legend(frameon=False, loc='lower right')

    # The x label is drawn as text below the rotated tick labels.
    ax.text(0.5, -0.3,
            'Temporal coherence (X)',
            ha='center', va='center',
            transform=ax.transAxes,
            fontsize=12, fontweight='semibold')
|  | @ -1,411 +0,0 @@ | |||
| import pandas as pd | ||||
| import litellm | ||||
| import dotenv | ||||
| import os | ||||
| import time | ||||
| import json | ||||
| import math | ||||
| 
 | ||||
| # Load environment variables | ||||
| dotenv.load_dotenv(override=True) | ||||
| 
 | ||||
| # litellm._turn_on_debug() # Optional debugging | ||||
| 
 | ||||
# --- Configuration ---
# Model and throughput settings.
MODEL = "gpt-4.1-mini"  # Make sure this model supports json_schema or structured output
RATE_LIMIT = 5000  # Requests per minute
CHUNK_SIZE = 300  # Number of unique tasks per API call
SECONDS_PER_MINUTE = 60

# Files: where classifications are written, and the task list a new
# classification file is seeded from (restricted to one filter value).
CLASSIFICATION_FILENAME = "tasks_estimateable.csv"  # Output file with classifications
TASK_SOURCE_FOR_INIT_FILENAME = "tasks_with_estimates.csv"
OUTPUT_COLUMN_NAME = "task_estimateable"
SOURCE_FILTER_COLUMN = "remote_status"
SOURCE_FILTER_VALUE = "remote"

# --- Prompts and Schema ---
SYSTEM_PROMPT_CLASSIFY = """
Classify the provided O*NET task into one of these categories:
 -  ATOMIC (schedulable): A single, clearly-bounded activity, typically lasting minutes, hours, or a few days.
 -  ONGOING-CONSTRAINT (background role/ethical rule): A continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”).
""".strip()

USER_MESSAGE_TEMPLATE_CLASSIFY = "Task: {task}"

CLASSIFICATION_CATEGORIES = ["ATOMIC", "ONGOING-CONSTRAINT"]

# Structured-output schema: one required string field limited to the categories.
SCHEMA_FOR_CLASSIFICATION = dict(
    name="classify_task_type",
    strict=True,
    schema=dict(
        type="object",
        properties=dict(
            task_category=dict(
                type="string",
                enum=CLASSIFICATION_CATEGORIES,
                description="The classification of the task (ATOMIC or ONGOING-CONSTRAINT).",
            ),
        ),
        required=["task_category"],
        additionalProperties=False,
    ),
)
| 
 | ||||
| 
 | ||||
def save_dataframe(df_to_save, filename):
    """Save the DataFrame as CSV (UTF-8 with BOM, no index) via an atomic replace.

    The data is first written to ``<filename>.tmp`` and then moved over the
    target with ``os.replace``, so a reader never observes a half-written file.

    Args:
        df_to_save: DataFrame to write.
        filename: destination path, as a ``str`` or an ``os.PathLike``.

    Returns:
        None. Saving is best-effort: a failure is reported on stdout, the
        temporary file is removed if it exists, and no exception propagates.
    """
    # Built before the try block so the cleanup path can always refer to it;
    # os.fspath also lets callers pass pathlib.Path objects.
    temp_filename = os.fspath(filename) + ".tmp"
    try:
        df_to_save.to_csv(temp_filename, encoding="utf-8-sig", index=False)
        os.replace(temp_filename, filename)
    except Exception as e:
        print(f"--- Error saving DataFrame to {filename}: {e} ---")
        if os.path.exists(temp_filename):
            try:
                os.remove(temp_filename)
            except OSError as remove_err:
                print(
                    f"--- Error removing temporary save file {temp_filename}: {remove_err} ---"
                )
| 
 | ||||
| 
 | ||||
| # --- Load or Initialize DataFrame --- | ||||
# Resume from an existing classification file, or seed a new one from the task
# source file (restricted to rows matching the source filter). Unrecoverable
# problems end the run with a non-zero exit status so a calling pipeline can
# detect the failure (a bare exit() reported success).
try:
    if os.path.exists(CLASSIFICATION_FILENAME):
        df = pd.read_csv(CLASSIFICATION_FILENAME, encoding="utf-8-sig")
        print(f"Successfully read {len(df)} rows from {CLASSIFICATION_FILENAME}.")

        save_needed_after_load = False
        if OUTPUT_COLUMN_NAME not in df.columns:
            df[OUTPUT_COLUMN_NAME] = pd.NA
            print(f"Added '{OUTPUT_COLUMN_NAME}' column.")
            save_needed_after_load = True

        # Normalise blank/None markers to NA. Assigned back explicitly: an
        # in-place replace on `df[col]` may act on a temporary object and
        # leave `df` itself unchanged.
        df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].replace(["", None], pd.NA)

        # The column must be able to hold strings; an all-empty column read
        # from CSV arrives with a numeric dtype.
        if df[OUTPUT_COLUMN_NAME].dtype != object and not isinstance(
            df[OUTPUT_COLUMN_NAME].dtype, pd.StringDtype
        ):
            try:
                df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].astype(object)
                print(
                    f"Corrected dtype of '{OUTPUT_COLUMN_NAME}' to {df[OUTPUT_COLUMN_NAME].dtype}."
                )
                save_needed_after_load = True
            except Exception as e:
                print(
                    f"Warning: Could not convert column '{OUTPUT_COLUMN_NAME}' to object: {e}."
                )

        if "task" not in df.columns:
            print(
                f"Error: {CLASSIFICATION_FILENAME} must contain a 'task' column for processing."
            )
            raise SystemExit(1)

        if save_needed_after_load:
            print(f"Saving {CLASSIFICATION_FILENAME} after adding/adjusting column.")
            save_dataframe(df, CLASSIFICATION_FILENAME)
    else:
        print(
            f"{CLASSIFICATION_FILENAME} not found. Attempting to create it from {TASK_SOURCE_FOR_INIT_FILENAME}."
        )
        if not os.path.exists(TASK_SOURCE_FOR_INIT_FILENAME):
            print(
                f"Error: Source file {TASK_SOURCE_FOR_INIT_FILENAME} not found. Cannot create {CLASSIFICATION_FILENAME}."
            )
            raise SystemExit(1)

        df_source = pd.read_csv(TASK_SOURCE_FOR_INIT_FILENAME, encoding="utf-8-sig")

        required_source_cols_for_init = ["task", SOURCE_FILTER_COLUMN]
        missing_source_cols = [
            col for col in required_source_cols_for_init if col not in df_source.columns
        ]
        if missing_source_cols:
            print(
                f"Error: Source file {TASK_SOURCE_FOR_INIT_FILENAME} is missing required columns for initialization: {', '.join(missing_source_cols)}."
            )
            raise SystemExit(1)

        df_source_filtered = df_source[
            df_source[SOURCE_FILTER_COLUMN] == SOURCE_FILTER_VALUE
        ].copy()

        if df_source_filtered.empty:
            print(
                f"Warning: No tasks with '{SOURCE_FILTER_COLUMN}' == '{SOURCE_FILTER_VALUE}' found in {TASK_SOURCE_FOR_INIT_FILENAME}. "
                f"{CLASSIFICATION_FILENAME} will be created with schema but no tasks to classify initially."
            )

        # New file: one row per source row, classification still empty.
        df = df_source_filtered[["task"]].copy()
        df[OUTPUT_COLUMN_NAME] = pd.NA
        df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].astype(object)

        print(
            f"Created {CLASSIFICATION_FILENAME} using tasks from {TASK_SOURCE_FOR_INIT_FILENAME} "
            f"(where {SOURCE_FILTER_COLUMN}='{SOURCE_FILTER_VALUE}'). New file has {len(df)} tasks."
        )
        save_dataframe(df, CLASSIFICATION_FILENAME)

# SystemExit is not an Exception subclass, so the exits above pass straight
# through these handlers.
except FileNotFoundError:
    print("Error: A required file was not found. Please check paths.")
    raise SystemExit(1)
except Exception as e:
    print(f"Error during DataFrame loading or initialization: {e}")
    raise SystemExit(1)
| 
 | ||||
| 
 | ||||
# --- Identify Unique Tasks to Process ---
# A row still needs classification while its OUTPUT_COLUMN_NAME cell is NA.
# Early exits use `raise SystemExit(0)` rather than `exit()`: the latter is a
# helper injected by the `site` module for interactive use and is not
# guaranteed to exist (e.g. `python -S`, frozen builds). Exit status stays 0.
if df.empty:
    print(f"{CLASSIFICATION_FILENAME} is empty. Nothing to process. Exiting.")
    raise SystemExit(0)

initial_unprocessed_mask = df[OUTPUT_COLUMN_NAME].isna()

if not initial_unprocessed_mask.any():
    print(
        f"All tasks in {CLASSIFICATION_FILENAME} seem to have been classified already. Exiting."
    )
    raise SystemExit(0)

# Only rows that are unprocessed AND carry a non-blank 'task' string are sent.
has_task_text = df["task"].notna() & (df["task"].str.strip() != "")
valid_tasks_to_consider_df = df[initial_unprocessed_mask & has_task_text]

if valid_tasks_to_consider_df.empty:
    print(
        "No valid, unclassified tasks found to process (after filtering out empty/NaN task descriptions). Exiting."
    )
    raise SystemExit(0)

# Identical task strings are classified once; the result is later fanned out
# to every matching row.
unique_task_labels_for_api = (
    valid_tasks_to_consider_df["task"].drop_duplicates().tolist()
)
# Count of all NA rows (including ones with blank task text), taken directly
# from the mask instead of materialising a filtered copy of the frame.
total_rows_to_update_potentially = int(initial_unprocessed_mask.sum())

print(
    f"Found {total_rows_to_update_potentially} total rows in {CLASSIFICATION_FILENAME} needing classification."
)
print(
    f"Identified {len(unique_task_labels_for_api)} unique, valid task labels to send to the API."
)
| 
 | ||||
| 
 | ||||
# --- Prepare messages for batch completion (only for unique task labels) ---
print(f"Preparing messages for {len(unique_task_labels_for_api)} unique task labels...")

# One [system, user] conversation per unique label, in the same order as
# unique_task_labels_for_api so responses can be matched back by position.
# Labels are already guaranteed non-empty and non-NaN by the filtering above.
messages_list = [
    [
        {"role": "system", "content": SYSTEM_PROMPT_CLASSIFY},
        {
            "role": "user",
            "content": USER_MESSAGE_TEMPLATE_CLASSIFY.format(task=task_label),
        },
    ]
    for task_label in unique_task_labels_for_api
]

print(f"Prepared {len(messages_list)} message sets for batch completion.")
if not messages_list:
    # Defensive: only reachable if unique_task_labels_for_api was empty, which
    # the earlier checks already rule out.
    print(
        "No messages prepared, though unique tasks were identified. This is unexpected. Exiting."
    )
    # `raise SystemExit` instead of the site-provided `exit()` helper; status
    # stays 0 as before.
    raise SystemExit(0)
| 
 | ||||
| 
 | ||||
# --- Call batch_completion in chunks with rate limiting and periodic saving ---
total_unique_tasks_to_send = len(messages_list)  # Same as len(unique_task_labels_for_api)
num_chunks = math.ceil(total_unique_tasks_to_send / CHUNK_SIZE)
# Loop-invariant: seconds each request must account for so a chunk stays under
# RATE_LIMIT requests per minute (0 disables pacing).
time_per_request = SECONDS_PER_MINUTE / RATE_LIMIT if RATE_LIMIT > 0 else 0

print(
    f"\nStarting batch classification for {total_unique_tasks_to_send} unique task labels in {num_chunks} chunks..."
)

overall_start_time = time.time()
processed_rows_count_total = 0  # Counts actual rows updated in the DataFrame

for i in range(num_chunks):
    chunk_start_message_index = i * CHUNK_SIZE
    chunk_end_message_index = min((i + 1) * CHUNK_SIZE, total_unique_tasks_to_send)

    message_chunk = messages_list[chunk_start_message_index:chunk_end_message_index]
    # Labels share indices with messages, so the same slice keeps them aligned.
    chunk_task_labels = unique_task_labels_for_api[
        chunk_start_message_index:chunk_end_message_index
    ]

    if not message_chunk:  # Should not happen if loop range is correct
        continue

    print(
        f"\nProcessing chunk {i + 1}/{num_chunks} (Unique Task Labels {chunk_start_message_index + 1}-{chunk_end_message_index} of this run)..."
    )
    chunk_start_time = time.time()
    try:
        print(
            f"Sending {len(message_chunk)} requests (for unique tasks) for chunk {i + 1}..."
        )
        responses = litellm.batch_completion(
            model=MODEL,
            messages=message_chunk,
            response_format={
                "type": "json_schema",
                "json_schema": SCHEMA_FOR_CLASSIFICATION,
            },
            num_retries=3,
        )
        print(f"Chunk {i + 1} API call completed.")
    except Exception as e:
        # A failure of the whole batch call must not abort the run: mark every
        # request in the chunk as failed and carry on with the next chunk.
        print(f"Error during litellm.batch_completion for chunk {i + 1}: {e}")
        responses = [None] * len(message_chunk)

    # --- Process responses for the current chunk ---
    # Maps task_label -> validated classification category.
    chunk_task_classifications = {}
    successful_api_calls_in_chunk = 0
    failed_api_calls_in_chunk = 0

    if responses and len(responses) == len(message_chunk):
        for current_task_label, response in zip(chunk_task_labels, responses):
            if response is None:
                print(
                    f"API call failed for task label '{current_task_label}' (response is None)."
                )
                failed_api_calls_in_chunk += 1
                continue

            # NOTE(review): litellm.batch_completion appears to place the raised
            # exception object in the result list for requests that failed
            # individually; confirm against the installed litellm version.
            if isinstance(response, Exception):
                print(
                    f"API call failed for task label '{current_task_label}': {type(response).__name__} - {response}"
                )
                failed_api_calls_in_chunk += 1
                continue

            content_str = None
            try:
                choices = response.choices
                first_message = choices[0].message if choices else None
                content_str = first_message.content if first_message else None

                if not content_str:
                    finish_reason = (
                        choices[0].finish_reason
                        if (choices and choices[0].finish_reason)
                        else "unknown"
                    )
                    error_message = (
                        first_message.content
                        if first_message
                        else "No content in message."
                    )
                    print(
                        f"Warning: Received non-standard or empty response content for task label '{current_task_label}'. "
                        f"Finish Reason: '{finish_reason}'. Message: '{error_message}'. Raw Choices: {response.choices}"
                    )
                    failed_api_calls_in_chunk += 1
                    continue

                classification_data = json.loads(content_str)
                # Valid JSON that is not an object (e.g. a bare list or string)
                # has no `.get`; treat it as "category missing" so it is
                # reported with the payload rather than as a missing attribute.
                category_raw = (
                    classification_data.get("task_category")
                    if isinstance(classification_data, dict)
                    else None
                )

                if category_raw in CLASSIFICATION_CATEGORIES:
                    successful_api_calls_in_chunk += 1
                    chunk_task_classifications[current_task_label] = category_raw
                else:
                    print(
                        f"Warning: Invalid or missing task_category for task label '{current_task_label}': '{category_raw}'. Content: '{content_str}'"
                    )
                    failed_api_calls_in_chunk += 1

            except json.JSONDecodeError:
                print(
                    f"Warning: Could not decode JSON for task label '{current_task_label}'. Content received: '{content_str}'"
                )
                failed_api_calls_in_chunk += 1
            except AttributeError as ae:
                print(
                    f"Warning: Missing attribute processing response for task label '{current_task_label}': {ae}. Response: {response}"
                )
                failed_api_calls_in_chunk += 1
            except Exception as e:
                # Last-resort guard so one malformed response cannot abort the
                # whole chunk; the label stays NA and is retried on the next run.
                print(
                    f"Warning: Unexpected error processing response for task label '{current_task_label}': {type(e).__name__} - {e}. Response: {response}"
                )
                failed_api_calls_in_chunk += 1
    else:
        print(
            f"Warning: Mismatch between #responses ({len(responses) if responses else 0}) "
            f"and #messages sent ({len(message_chunk)}) for chunk {i + 1}, or no responses. Marking all API calls in chunk as failed."
        )
        failed_api_calls_in_chunk = len(message_chunk)

    # --- Update Main DataFrame and Save Periodically ---
    rows_updated_this_chunk = 0
    if chunk_task_classifications:
        print(
            f"Updating main DataFrame with classifications for {len(chunk_task_classifications)} unique tasks from chunk {i + 1}..."
        )
        for task_label, category in chunk_task_classifications.items():
            # Fan the result out to every row with this task text that is
            # still unclassified; already-filled rows are never overwritten.
            update_condition = (df["task"] == task_label) & df[
                OUTPUT_COLUMN_NAME
            ].isna()
            num_rows_for_this_task_label = int(update_condition.sum())

            if num_rows_for_this_task_label > 0:
                df.loc[update_condition, OUTPUT_COLUMN_NAME] = category
                rows_updated_this_chunk += num_rows_for_this_task_label

        print(
            f"Updated {rows_updated_this_chunk} rows in the DataFrame based on this chunk's API responses."
        )
        # Persist after every successful chunk so an interrupted run can resume.
        print(f"Saving progress to {CLASSIFICATION_FILENAME}...")
        save_dataframe(df, CLASSIFICATION_FILENAME)
    else:
        print(
            f"No successful API classifications obtained in chunk {i + 1} to update DataFrame or save."
        )

    print(
        f"Chunk {i + 1} API summary: Successful Calls={successful_api_calls_in_chunk}, Failed/Skipped Calls={failed_api_calls_in_chunk}. "
        f"Rows updated in DataFrame this chunk: {rows_updated_this_chunk}"
    )
    processed_rows_count_total += rows_updated_this_chunk

    # --- Rate Limiting Pause ---
    chunk_duration = time.time() - chunk_start_time
    print(f"Chunk {i + 1} (API calls and DF update) took {chunk_duration:.2f} seconds.")

    if i < num_chunks - 1:  # No need to wait after the last chunk
        # Sleep off whatever is left of the minimum time this many requests
        # are allowed to take under the rate limit.
        min_chunk_duration_for_rate = len(message_chunk) * time_per_request
        pause_needed = max(0, min_chunk_duration_for_rate - chunk_duration)

        if pause_needed > 0:
            print(
                f"Pausing for {pause_needed:.2f} seconds to respect rate limit ({RATE_LIMIT}/min)..."
            )
            time.sleep(pause_needed)

total_duration_minutes = (time.time() - overall_start_time) / 60
print(
    "\nBatch classification finished."
    f" Updated {processed_rows_count_total} rows in '{CLASSIFICATION_FILENAME}' with new classifications in this run."
    f" Total duration: {total_duration_minutes:.2f} minutes."
)

print(f"Performing final save to {CLASSIFICATION_FILENAME}...")
save_dataframe(df, CLASSIFICATION_FILENAME)

print("\nScript finished.")
|  | @ -1,85 +0,0 @@ | |||
#!/usr/bin/env bash
#
# Build a local SQLite copy of the O*NET database.
#
# Steps: download the SQL dump archive (skipped if already present), extract
# it (skipped if already extracted), recreate $ONET_DB_NAME from scratch,
# import every *.sql file in a single transaction, then switch the database
# to WAL mode and compact it.

# Set database name and directories
ONET_DB_NAME="onet.database"
ONET_ZIP_URL="https://www.onetcenter.org/dl_files/database/db_29_1_mysql.zip"
ONET_ZIP_FILE="db_29_1_mysql.zip"
ONET_EXTRACT_DIR="db_29_1_mysql"

# Download O*NET database only if not already downloaded
if [ ! -f "$ONET_ZIP_FILE" ]; then
    echo "Downloading O*NET database from $ONET_ZIP_URL"
    # Download to a temporary name so an interrupted or failed transfer is
    # never mistaken for a complete archive on the next run. `curl -f` turns
    # HTTP errors into a non-zero exit instead of saving the error page.
    ONET_ZIP_TMP="$ONET_ZIP_FILE.part"
    if curl -fL -o "$ONET_ZIP_TMP" "$ONET_ZIP_URL" || wget -O "$ONET_ZIP_TMP" "$ONET_ZIP_URL"; then
        mv "$ONET_ZIP_TMP" "$ONET_ZIP_FILE"
    else
        rm -f "$ONET_ZIP_TMP"
        echo "Failed to download O*NET database"
        exit 1
    fi
else
    echo "Using existing O*NET database zip file"
fi

# Extract downloaded zip file only if extraction directory doesn't exist
if [ ! -d "$ONET_EXTRACT_DIR" ]; then
    echo "Extracting O*NET database files"
    if ! unzip -o "$ONET_ZIP_FILE"; then
        echo "Failed to extract O*NET database files"
        exit 1
    fi
else
    echo "Using existing extracted O*NET database files"
fi

# Remove existing database if it exists
if [ -f "$ONET_DB_NAME" ]; then
    echo "Removing existing database"
    rm "$ONET_DB_NAME"
fi

# Create a new SQLite database and import all SQL files in one transaction.
#
# The fast-import PRAGMAs are issued in the SAME sqlite3 session as the import:
# synchronous, cache_size, locking_mode and temp_store only affect the
# connection that sets them, so running them in a separate sqlite3 invocation
# (as before) had no effect on the import. foreign_keys is deliberately left
# at its default (off) during the import, which matches the previous effective
# behaviour and lets files load in alphabetical order regardless of
# inter-table references.
#
# The SQL is streamed through a pipe instead of being expanded into a heredoc,
# so the whole dump never has to be held in shell memory.
echo "Creating new SQLite database: $ONET_DB_NAME with performance settings"
echo "Executing SQL files in alphabetical order (single transaction mode)"
{
    cat << 'EOF'
PRAGMA journal_mode = OFF;
PRAGMA synchronous = 0;
PRAGMA cache_size = 1000000;
PRAGMA locking_mode = EXCLUSIVE;
PRAGMA temp_store = MEMORY;
BEGIN TRANSACTION;
EOF
    find "$ONET_EXTRACT_DIR" -name "*.sql" | sort | xargs cat
    # Leading newline guards against a final file that ends without one.
    printf '\nCOMMIT;\n'
} | sqlite3 "$ONET_DB_NAME"
# Exit status of the pipeline is that of sqlite3 (its last command).
import_status=$?

# Check if the execution was successful
if [ "$import_status" -ne 0 ]; then
    echo "Error executing SQL files in batch transaction"
    exit 1
else
    echo "Database populated successfully. Restoring reliability settings..."

    # Restore reliability-focused settings after import.
    # Of these, journal_mode = WAL is the one stored in the database file;
    # the remaining PRAGMAs only apply to this short-lived session.
    sqlite3 "$ONET_DB_NAME" << 'EOF'
PRAGMA journal_mode = WAL;
PRAGMA synchronous = NORMAL;
PRAGMA locking_mode = NORMAL;
PRAGMA temp_store = DEFAULT;
PRAGMA foreign_keys = ON;
PRAGMA optimize;
VACUUM;
EOF

    if [ $? -ne 0 ]; then
        echo "Warning: Failed to restore reliability settings, but database is populated"
    else
        echo "Reliability settings restored successfully"
    fi

    echo "O*NET database created and optimized successfully!"
fi
|  | @ -1,392 +0,0 @@ | |||
| import sqlite3 | ||||
| import pandas as pd | ||||
| import json | ||||
| import os | ||||
| from collections import defaultdict | ||||
| import numpy as np | ||||
| 
 | ||||
| # --- Configuration --- | ||||
| DB_FILE = "onet.database" | ||||
| OUTPUT_FILE = "task_ratings_enriched.json"  # Changed output filename | ||||
| 
 | ||||
| # --- Database Interaction --- | ||||
| 
 | ||||
| 
 | ||||
def fetch_data_from_db(db_path):
    """
    Fetch task ratings and task-to-DWA mappings from the O*NET SQLite database.

    A single query joins task ratings with task statements, occupation data
    and (via LEFT JOINs, so tasks without DWAs are kept) the DWA tables. The
    joined result is then split into two de-duplicated frames.

    Args:
        db_path (str): Path to the SQLite database file.

    Returns:
        tuple(pandas.DataFrame, pandas.DataFrame): A tuple containing:
            - DataFrame with task ratings info (one row per unique rating).
            - DataFrame with task-to-DWA mapping (rows without a DWA dropped).
        Both frames are empty (but carry the expected columns) when the query
        returns no rows. Returns (None, None) if the database file doesn't
        exist or an error occurs.
    """
    dwa_cols = ["onetsoc_code", "task_id", "dwa_title"]
    core_cols = [
        "onetsoc_code",
        "task_id",
        "task",
        "occupation_title",
        "occupation_description",
        "scale_id",
        "category",
        "data_value",
    ]
    query = """
        SELECT
            tr.onetsoc_code,
            tr.task_id,
            ts.task,
            od.title AS occupation_title,
            od.description AS occupation_description,
            tr.scale_id,
            tr.category,
            tr.data_value,
            dr.dwa_title
        FROM
            task_ratings tr
        JOIN
            task_statements ts ON tr.task_id = ts.task_id
        JOIN
            occupation_data od ON tr.onetsoc_code = od.onetsoc_code
        LEFT JOIN
            tasks_to_dwas td ON tr.onetsoc_code = td.onetsoc_code AND tr.task_id = td.task_id
        LEFT JOIN
            dwa_reference dr ON td.dwa_id = dr.dwa_id;
        """

    if not os.path.exists(db_path):
        print(f"Error: Database file not found at {db_path}")
        return None, None

    # `conn` is initialised up front so the `finally` clause can close it on
    # every path without inspecting locals().
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        df = pd.read_sql_query(query, conn)
        print(
            f"Successfully fetched {len(df)} records (including DWA info) from the database."
        )

        if df.empty:
            print("Warning: Fetched DataFrame is empty.")
            # Keep the expected schema so callers can process the result uniformly.
            return pd.DataFrame(columns=core_cols), pd.DataFrame(columns=dwa_cols)

        missing_core_cols = [col for col in core_cols if col not in df.columns]
        if missing_core_cols:
            print(f"Error: Missing core columns in fetched data: {missing_core_cols}")
            return None, None

        # The DWA join repeats each rating once per associated DWA; collapse
        # those repeats so each rating appears exactly once.
        ratings_df = df[core_cols].drop_duplicates().reset_index(drop=True)

        if all(col in df.columns for col in dwa_cols):
            # Tasks without any DWA surface as NULL titles from the LEFT JOIN.
            dwas_df = (
                df[dwa_cols]
                .dropna(subset=["dwa_title"])
                .drop_duplicates()
                .reset_index(drop=True)
            )
        else:
            print("Warning: DWA related columns missing, creating empty DWA DataFrame.")
            dwas_df = pd.DataFrame(columns=dwa_cols)

        return ratings_df, dwas_df

    except sqlite3.Error as e:
        print(f"SQLite error: {e}")
        return None, None
    except Exception as e:
        # Covers pandas-level failures too; callers only rely on the
        # (None, None) signal.
        print(f"An error occurred during data fetching: {e}")
        return None, None
    finally:
        if conn is not None:
            conn.close()
| 
 | ||||
| 
 | ||||
| # --- Data Processing --- | ||||
| 
 | ||||
| 
 | ||||
def _mean_rating_by_task(ratings_df, scale_ids, label, value_name):
    """Average `data_value` per (onetsoc_code, task_id) over the given scale ids.

    Returns a frame with columns onetsoc_code, task_id and `value_name`, or
    None when no rating uses any of `scale_ids`.
    """
    subset = ratings_df[ratings_df["scale_id"].isin(scale_ids)]
    if subset.empty:
        print(f"No {label} ({', '.join(scale_ids)}) data found.")
        return None
    averaged = (
        subset.groupby(["onetsoc_code", "task_id"])["data_value"]
        .mean()
        .reset_index()
        .rename(columns={"data_value": value_name})
    )
    print(f"Processed {label} data. Shape: {averaged.shape}")
    return averaged


def process_task_ratings_with_dwas(ratings_df, dwas_df):
    """
    Build one enriched record per (occupation, task) from raw rating rows.

    Frequency (FT) ratings are pivoted into one `frequency_category_<n>`
    column per category, importance (IM, IJ) and relevance (RT) ratings are
    averaged, and the DWA titles associated with each task are collected
    into a `dwas` list.

    Args:
        ratings_df (pandas.DataFrame): Task ratings with columns onetsoc_code,
            task_id, task, occupation_title, occupation_description, scale_id,
            category and data_value.
        dwas_df (pandas.DataFrame): Task-to-DWA mapping with columns
            onetsoc_code, task_id and dwa_title. Can be None or empty.

    Returns:
        list: A list of dictionaries, each representing an enriched task
              rating with DWAs; missing values are None. Returns an empty list
              for an empty ratings DataFrame and None if the ratings DataFrame
              is invalid or lacks required columns.
    """
    if ratings_df is None or not isinstance(ratings_df, pd.DataFrame):
        print("Error: Input ratings DataFrame is invalid.")
        return None

    key_cols = ["onetsoc_code", "task_id"]
    base_cols = key_cols + ["task", "occupation_title", "occupation_description"]
    required_cols = base_cols + ["scale_id", "category", "data_value"]

    # Validate the schema before touching any column, so a malformed frame
    # yields the documented None instead of a KeyError further down.
    missing_cols = [col for col in required_cols if col not in ratings_df.columns]
    if missing_cols:
        print(
            f"Error: Missing required columns in ratings_df: {missing_cols}. Cannot proceed."
        )
        return None

    if ratings_df.empty:
        print(
            "Warning: Input ratings DataFrame is empty. Processing will yield empty result."
        )
        return []

    if dwas_df is None or not isinstance(dwas_df, pd.DataFrame):
        print("Warning: Invalid or missing DWA DataFrame. Proceeding without DWA data.")
        dwas_df = pd.DataFrame(columns=key_cols + ["dwa_title"])

    print("Starting data processing...")

    # --- 1. Frequency (FT): one column per category ---
    freq_df = ratings_df[ratings_df["scale_id"] == "FT"]
    if freq_df.empty:
        print("No Frequency (FT) data found.")
        freq_pivot = None
    else:
        freq_pivot = freq_df.pivot_table(
            index=key_cols,
            columns="category",
            values="data_value",
            fill_value=0,
        )
        freq_pivot.columns = [
            f"frequency_category_{int(col)}" for col in freq_pivot.columns
        ]
        print(f"Processed Frequency data. Shape: {freq_pivot.shape}")

    # --- 2. Importance (IM, IJ) and 3. Relevance (RT): per-task averages ---
    imp_avg = _mean_rating_by_task(
        ratings_df, ["IM", "IJ"], "Importance", "importance_average"
    )
    rel_avg = _mean_rating_by_task(
        ratings_df, ["RT"], "Relevance", "relevance_average"
    )

    # --- 4. DWAs: collect titles into one list per task ---
    if not dwas_df.empty and "dwa_title" in dwas_df.columns:
        print("Processing DWA data...")
        dwas_grouped = (
            dwas_df.groupby(key_cols)["dwa_title"]
            .apply(list)
            .reset_index()
            .rename(columns={"dwa_title": "dwas"})
        )
        print(f"Processed DWA data. Shape: {dwas_grouped.shape}")
    else:
        print("No valid DWA data found or provided for processing.")
        dwas_grouped = None

    # --- 5. Base task/occupation info: one row per (occupation, task) ---
    base_info = ratings_df[base_cols].drop_duplicates().set_index(key_cols)
    print(f"Extracted base info. Shape: {base_info.shape}")

    # --- 6. Merge everything onto the base rows (left joins keep every task) ---
    print("Merging processed data...")
    final_df = base_info
    if freq_pivot is not None:
        final_df = final_df.merge(
            freq_pivot, left_index=True, right_index=True, how="left"
        )
    final_df = final_df.reset_index()

    for averaged, column in (
        (imp_avg, "importance_average"),
        (rel_avg, "relevance_average"),
    ):
        if averaged is not None:
            final_df = final_df.merge(averaged, on=key_cols, how="left")
        else:
            # Keep the output schema stable even when a scale is absent.
            final_df[column] = np.nan

    if dwas_grouped is not None and not dwas_grouped.empty:
        final_df = final_df.merge(dwas_grouped, on=key_cols, how="left")
        # Tasks without DWAs come out of the left join as NaN; give them [].
        final_df["dwas"] = final_df["dwas"].apply(
            lambda x: x if isinstance(x, list) else []
        )
    else:
        final_df["dwas"] = [[] for _ in range(len(final_df))]

    print(f"Final merged data shape: {final_df.shape}")

    # JSON has no NaN: convert float NaN to None record by record. Doing this on
    # plain Python values avoids depending on DataFrame.replace's dtype
    # coercion and leaves the list-valued 'dwas' cells untouched.
    return [
        {
            key: (None if isinstance(value, float) and np.isnan(value) else value)
            for key, value in record.items()
        }
        for record in final_df.to_dict(orient="records")
    ]
| 
 | ||||
| 
 | ||||
| # --- Output --- | ||||
| 
 | ||||
| 
 | ||||
def write_to_json(data, output_path):
    """
    Writes the processed data to a JSON file.

    The JSON is first written to a temporary sibling file and then moved into
    place, so a failure while serializing or writing never leaves a truncated
    file at `output_path` (an existing file is kept intact). Problems are
    reported on stdout; nothing is raised.

    Args:
        data (list): The list of dictionaries to write.
        output_path (str): Path to the output JSON file.
    """
    if data is None:
        print("No data to write to JSON.")
        return
    if not isinstance(data, list):
        print(
            f"Error: Data to write is not a list (type: {type(data)}). Cannot write to JSON."
        )
        return

    # Create the output directory if needed. exist_ok=True avoids the
    # check-then-create race of testing for existence first.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        dir_existed = os.path.isdir(output_dir)
        try:
            os.makedirs(output_dir, exist_ok=True)
        except OSError as e:
            print(f"Error creating output directory {output_dir}: {e}")
            return
        if not dir_existed:
            print(f"Created output directory: {output_dir}")

    tmp_path = f"{output_path}.tmp"
    written = False
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4, ensure_ascii=False)
        # Atomic on the same filesystem: readers see the old or the new file.
        os.replace(tmp_path, output_path)
        written = True
        print(f"Successfully wrote enriched data to {output_path}")
    except OSError as e:
        print(f"Error writing JSON file to {output_path}: {e}")
    except TypeError as e:
        print(f"Error during JSON serialization: {e}. Check data types.")
    except Exception as e:
        print(f"An unexpected error occurred during JSON writing: {e}")
    finally:
        if not written and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                # Best-effort cleanup; the original error was already reported.
                pass
| 
 | ||||
| 
 | ||||
| # --- Main Execution --- | ||||
| 
 | ||||
if __name__ == "__main__":
    # Script entry point: fetch -> process -> write, reporting each failure mode.
    print("Starting O*NET Task Ratings & DWAs Enrichment Script...")

    # Step 1: pull both the ratings and the DWA tables from the database.
    ratings_data_df, dwas_data_df = fetch_data_from_db(DB_FILE)

    if not isinstance(ratings_data_df, pd.DataFrame):
        # Without a ratings DataFrame (an empty one is acceptable) nothing can be processed.
        print(
            "Data fetching failed or returned invalid type for ratings data. Script terminated."
        )
    else:
        # Step 2: dwas_data_df may be None or empty; the processing function copes with that.
        enriched_data = process_task_ratings_with_dwas(ratings_data_df, dwas_data_df)

        # Step 3: only None signals failure; an empty list is still written out.
        if enriched_data is None:
            print("Data processing failed or returned None. No output file generated.")
        else:
            write_to_json(enriched_data, OUTPUT_FILE)

    print("Script finished.")
							
								
								
									
										81
									
								
								pipeline/aggregate.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						|  | @ -0,0 +1,81 @@ | |||
| from .utils import OCCUPATION_MAJOR_CODES | ||||
| import pandas as pd | ||||
| 
 | ||||
def create_task_summary_by_occupation_df(df_tasks: pd.DataFrame, oesm_df: pd.DataFrame) -> pd.DataFrame:
    """Summarise task counts and the OESM wage bill for every O*NET occupation code.

    Tasks of each occupation are split into three disjoint buckets: not remote,
    remote and estimable, remote and not estimable. A missing or non-true
    ``estimable`` value counts as "not estimable".

    Args:
        df_tasks: One row per task with columns ``onetsoc_code``,
            ``remote_status`` and ``estimable``.
        oesm_df: OESM table with columns ``OCC_CODE``, ``TOT_EMP``, ``A_MEAN``
            and (optionally) ``OCC_TITLE``.

    Returns:
        One row per distinct ``onetsoc_code`` (in order of first appearance)
        with columns ``onetsoc_code``, ``occupation_label``, ``wage_bill``,
        ``count_not_remote``, ``count_remote_estimable``,
        ``count_remote_not_estimable`` and ``total_tasks``. Occupations without
        usable wage data get ``wage_bill`` 0 and label "Unknown". The columns
        are present even when ``df_tasks`` is empty.
    """
    output_columns = [
        'onetsoc_code',
        'occupation_label',
        'wage_bill',
        'count_not_remote',
        'count_remote_estimable',
        'count_remote_not_estimable',
        'total_tasks',
    ]

    # --- OESM Wage Bill Calculation ---
    oesm = oesm_df.rename(columns={'OCC_CODE': 'onetsoc_code'}).copy()
    # Non-numeric markers are coerced to NaN and the affected rows dropped.
    oesm['TOT_EMP'] = pd.to_numeric(oesm['TOT_EMP'], errors='coerce')
    oesm['A_MEAN'] = pd.to_numeric(oesm['A_MEAN'], errors='coerce')
    oesm = oesm.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_code'])
    oesm['wage_bill'] = oesm['TOT_EMP'] * oesm['A_MEAN']
    # Keep one row per code: a duplicated index would make lookups return a
    # Series instead of a scalar.
    oesm_lookup = oesm.drop_duplicates(subset='onetsoc_code').set_index('onetsoc_code')

    # --- Per-occupation task counts (single vectorised pass) ---
    is_remote = df_tasks['remote_status'].eq('remote')
    # Coerce to a strict boolean mask so NaN / object-dtype values cannot break `~`.
    is_estimable = df_tasks['estimable'].eq(True).fillna(False).astype(bool)
    flags = pd.DataFrame({
        'onetsoc_code': df_tasks['onetsoc_code'].to_numpy(),
        'count_not_remote': (~is_remote).to_numpy(),
        'count_remote_estimable': (is_remote & is_estimable).to_numpy(),
        'count_remote_not_estimable': (is_remote & ~is_estimable).to_numpy(),
    })
    summary = flags.groupby('onetsoc_code', sort=False).sum().astype(int).reset_index()
    # The three buckets partition the tasks, so their sum is the task total.
    summary['total_tasks'] = (
        summary['count_not_remote']
        + summary['count_remote_estimable']
        + summary['count_remote_not_estimable']
    )

    # O*NET codes (e.g., 11-1011.03) are more specific than OESM SOC codes (e.g., 11-1011).
    # We strip the suffix from the O*NET code to find the corresponding wage data.
    # NOTE(review): every O*NET code sharing a SOC prefix receives the full SOC
    # wage bill, so summing wage_bill downstream can count it more than once.
    soc_codes = summary['onetsoc_code'].astype(str).str.split('.').str[0]
    summary['wage_bill'] = soc_codes.map(oesm_lookup['wage_bill']).fillna(0)
    if 'OCC_TITLE' in oesm_lookup.columns:
        summary['occupation_label'] = soc_codes.map(oesm_lookup['OCC_TITLE']).fillna("Unknown")
    else:
        summary['occupation_label'] = "Unknown"

    return summary[output_columns]
| 
 | ||||
| 
 | ||||
def aggregate_task_summary_by_major_code(summary_df: pd.DataFrame) -> pd.DataFrame:
    """Roll the per-occupation summary up to the two-character major occupation code.

    Args:
        summary_df: Frame with an ``onetsoc_code`` string column plus the
            numeric columns ``wage_bill``, ``count_not_remote``,
            ``count_remote_estimable``, ``count_remote_not_estimable`` and
            ``total_tasks``.

    Returns:
        One row per major code with those numeric columns summed and an
        ``occupation_label`` looked up in ``OCCUPATION_MAJOR_CODES``.
    """
    summed_columns = [
        'wage_bill',
        'count_not_remote',
        'count_remote_estimable',
        'count_remote_not_estimable',
        'total_tasks',
    ]

    # The major group is the first two characters of the occupation code.
    with_major = summary_df.assign(onetsoc_major_code=summary_df['onetsoc_code'].str[:2])

    totals = (
        with_major
        .groupby('onetsoc_major_code')
        .agg(dict.fromkeys(summed_columns, 'sum'))
        .reset_index()
    )
    totals['occupation_label'] = totals['onetsoc_major_code'].map(OCCUPATION_MAJOR_CODES)

    # Reorder columns to match original output format
    return totals[['onetsoc_major_code', 'occupation_label', *summed_columns]]
							
								
								
									
										225
									
								
								pipeline/classification.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						|  | @ -0,0 +1,225 @@ | |||
| from pathlib import Path | ||||
| import pandas as pd | ||||
| from .logger import logger | ||||
| from .utils import enrich | ||||
| import json | ||||
| 
 | ||||
# Time units accepted for the "unit" field of a time estimate; used as the
# `enum` of the JSON schema sent with the estimate request.
ALLOWED_UNITS = [
    "minute",
    "hour",
    "day",
    "week",
    "month",
    "trimester",
    "semester",
    "year",
]

# Version tags embedded in the cache file names below, so changing a tag makes
# the corresponding step look for a different cache file.
ESTIMABLE_CLASSIFICATION_VERSION = "old_version"
TIME_ESTIMATES_GENERATION_VERSION = "old_version"
| 
 | ||||
def classify_tasks_as_estimable(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
    """Classify each unique task text as estimable (ATOMIC) or not, with on-disk caching.

    Args:
        cache_dir: Directory holding the versioned parquet cache.
        df_to_process: Tasks to classify; must contain a ``task`` column.
            Duplicated task texts are classified once.
        bust: When True, ignore an existing cache file and recompute.

    Returns:
        DataFrame with columns ``task`` and ``estimable`` (bool). A task whose
        request failed or whose response could not be parsed is recorded as
        ``estimable=False``.

    Raises:
        ValueError: If there are no tasks to classify, or ``enrich`` returns
            nothing or a different number of results than tasks.
    """
    cache_path = cache_dir / f"task_estimability.{ESTIMABLE_CLASSIFICATION_VERSION}.parquet"
    if cache_path.exists() and not bust:
        logger.info(f"Loading cached task estimability from {cache_path}")
        return pd.read_parquet(cache_path)

    logger.info("Enriching tasks with estimability classification.")

    df_unique_tasks = df_to_process.drop_duplicates(subset=['task']).copy()

    logger.info(f"Found {len(df_unique_tasks)} unique remote tasks to classify.")

    if df_unique_tasks.empty:
        raise ValueError("No unique tasks to classify.")

    # Interior indentation of this literal is part of the prompt text; keep it as is.
    system_prompt = """
                    Classify the provided O*NET task into one of these categories:
                    -  ATOMIC (schedulable): A single, clearly-bounded activity, typically lasting minutes, hours, or a few days.
                    -  ONGOING-CONSTRAINT (background role/ethical rule): A continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”).
                    """.strip()

    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=[
            [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Task: {row.task}"},
            ]
            for row in df_unique_tasks.itertuples()
        ],
        schema={
            "name": "estimability_classification",
            "schema": {
                "type": "object",
                "properties": {"task_category": {"type": "string", "enum": ["ATOMIC", "ONGOING-CONSTRAINT"]}},
                "required": ["task_category"],
                "additionalProperties": False
            }
        },
        chunk_size=300,
    )

    if not results or len(results) != len(df_unique_tasks):
        raise ValueError(f"Task estimability classification failed or returned mismatched number of results. Expected {len(df_unique_tasks)}, got {len(results) if results else 0}.")

    classifications = []
    # 'estimable' is always a bool, so successes must be counted explicitly;
    # counting non-null values would report every task as successful.
    successful_count = 0
    for task_label, response in zip(df_unique_tasks['task'], results):
        task_category_flag = None

        if response is None:
            logger.warning(f"API call failed for task (enrich returned None): '{task_label}'")
        else:
            try:
                content_str = response.choices[0].message.content
                if not content_str:
                    raise ValueError("No content found in the response message")

                data = json.loads(content_str)

                if 'task_category' in data and isinstance(data['task_category'], str):
                    task_category_flag = data['task_category']
                    successful_count += 1
                else:
                    logger.warning(f"Invalid or missing 'task_category' payload for task '{task_label}'. Data: '{data}'")
            except (json.JSONDecodeError, AttributeError, KeyError, IndexError, ValueError) as e:
                logger.warning(f"Could not parse response for task '{task_label}'. Error: {e}. Response: {response}")

        classifications.append({
            'task': task_label,
            'estimable': task_category_flag == 'ATOMIC'
        })

    classification_df = pd.DataFrame(classifications)

    logger.info(f"Finished classification. Got {successful_count} successful classifications out of {len(df_unique_tasks)} unique tasks.")

    logger.info(f"Saving task estimability classifications to {cache_path}")
    # Make sure the cache directory exists so the API results are not lost on save.
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    classification_df.to_parquet(cache_path)

    return classification_df
| 
 | ||||
| 
 | ||||
def generate_time_estimates_for_tasks(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
    """Request lower/upper effective-time estimates for every task, with on-disk caching.

    Args:
        cache_dir: Directory holding the versioned parquet cache.
        df_to_process: Tasks to estimate; reads the columns ``task``,
            ``occupation_title``, ``occupation_description``, ``onetsoc_code``
            and ``task_id``.
        bust: When True, ignore an existing cache file and recompute.

    Returns:
        DataFrame with ``onetsoc_code``, ``task_id`` and the four estimate
        columns ``lb_estimate_qty``, ``lb_estimate_unit``, ``ub_estimate_qty``,
        ``ub_estimate_unit``; all four are None for a task whose response was
        missing or unparsable.

    Raises:
        ValueError: If there are no tasks, or ``enrich`` returns nothing or a
            different number of results than tasks.
    """
    estimates_cache = cache_dir / f"task_estimates.{TIME_ESTIMATES_GENERATION_VERSION}.parquet"
    if not bust and estimates_cache.exists():
        logger.info(f"Loading cached task estimates from {estimates_cache}")
        return pd.read_parquet(estimates_cache)

    logger.info("Enriching tasks with time estimates.")

    if df_to_process.empty:
        raise ValueError("No tasks to process for estimates.")

    # Interior indentation and blank lines of this literal are part of the prompt text.
    system_prompt = """
                        You are an expert assistant evaluating the time required for job tasks. Your goal is to estimate the 'effective time' range needed for a skilled human to complete the following job task **remotely**, without supervision

                        'Effective time' is the active, focused work duration required to complete the task. Crucially, **exclude all waiting periods, delays, or time spent on other unrelated activities**. Think of it as the continuous, productive time investment needed if the worker could pause and resume instantly without cost.

                        Provide a lower and upper bound estimate for the 'effective time'. These bounds should capture the time within which approximately 80% of instances of performing this specific task are typically completed by a qualified individual.

                        Base your estimate on the provided task and the associated occupation and occupation description. Your estimate must be in one the allowed units: minute, hour, day, week, month, trimester, semester, year.""".strip()

    def bound_schema(which: str) -> dict:
        # JSON schema of one bound; `which` is "lower" or "upper".
        return {
            "type": "object",
            "properties": {
                "quantity": {
                    "type": "number",
                    "description": f"The numerical value for the {which} bound of the estimate.",
                },
                "unit": {
                    "type": "string",
                    "enum": ALLOWED_UNITS,
                    "description": f"The unit of time for the {which} bound.",
                },
            },
            "required": ["quantity", "unit"],
            "additionalProperties": False,
        }

    conversations = []
    for row in df_to_process.itertuples():
        conversations.append([
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": f"{row.task} done by {row.occupation_title} ({row.occupation_description})",
            },
        ])

    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=conversations,
        schema={
            "name": "estimate_time",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "lower_bound_estimate": bound_schema("lower"),
                    "upper_bound_estimate": bound_schema("upper"),
                },
                "required": ["lower_bound_estimate", "upper_bound_estimate"],
                "additionalProperties": False,
            },
        },
        chunk_size=200,
    )

    if not results or len(results) != len(df_to_process):
        raise ValueError(f"API call for task estimates failed or returned mismatched number of results. "
            f"Expected {len(df_to_process)}, got {len(results) if results else 0}.")

    estimates = []
    task_keys = zip(df_to_process['onetsoc_code'], df_to_process['task_id'])
    for (soc_code, task_id), response in zip(task_keys, results):
        task_info = f"O*NET: {soc_code}, Task ID: {task_id}"
        # Stays all-None unless the whole payload parses.
        bounds = (None, None, None, None)

        if response is None:
            logger.warning(f"API call failed for task (enrich returned None): {task_info}")
        else:
            try:
                content_str = response.choices[0].message.content
                if not content_str:
                    raise ValueError("No content found in the response message")

                payload = json.loads(content_str)
                bounds = (
                    payload['lower_bound_estimate']['quantity'],
                    payload['lower_bound_estimate']['unit'],
                    payload['upper_bound_estimate']['quantity'],
                    payload['upper_bound_estimate']['unit'],
                )
            except Exception as e:
                logger.warning(f"Could not parse valid estimate for task {task_info}. Error: {e}. Response: {response}")

        lower_qty, lower_unit, upper_qty, upper_unit = bounds
        estimates.append({
            'onetsoc_code': soc_code,
            'task_id': task_id,
            'lb_estimate_qty': lower_qty,
            'lb_estimate_unit': lower_unit,
            'ub_estimate_qty': upper_qty,
            'ub_estimate_unit': upper_unit
        })

    estimates_df = pd.DataFrame(estimates)
    logger.info(f"Finished estimates. Got {estimates_df['lb_estimate_qty'].notna().sum()} successful estimates out of {len(df_to_process)} tasks.")

    logger.info(f"Saving task estimates to {estimates_cache}")
    estimates_df.to_parquet(estimates_cache)

    return estimates_df
|  | @ -1,35 +0,0 @@ | |||
# Display label for each two-character major occupation group, keyed by the
# leading two characters of an occupation code.
OCCUPATION_MAJOR_CODES = {
    '11': 'Management',
    '13': 'Business & Financial',
    '15': 'Computer & Mathematical',
    '17': 'Architecture & Engineering',
    '19': 'Life, Physical, & Social Science',
    '21': 'Community & Social Service',
    '23': 'Legal',
    '25': 'Education, Training, & Library',
    '27': 'Arts, Design, & Media',
    '29': 'Healthcare Practitioners',
    '31': 'Healthcare Support',
    '33': 'Protective Service',
    '35': 'Food Preparation & Serving',
    '37': 'Building & Grounds Maintenance',
    '39': 'Personal Care & Service',
    '41': 'Sales & Related',
    '43': 'Office & Admin Support',
    '45': 'Farming, Fishing, & Forestry',
    '47': 'Construction & Extraction',
    '49': 'Installation, Maintenance, & Repair',
    '51': 'Production',
    '53': 'Transportation & Material Moving',
    '55': 'Military Specific',
}

# Gray color scale: shade key (as a string) -> hex color.
# NOTE(review): presumably used for chart styling - confirm against callers.
GRAY   = {'50':'#f8fafc','100':'#f1f5f9','200':'#e2e8f0',
    '300':'#cbd5e1','400':'#94a3b8','500':'#64748b',
    '600':'#475569','700':'#334155','800':'#1e293b',
    '900':'#0f172a','950':'#020617'}

# Lime color scale with the same shade keys as GRAY.
LIME            = {'50': '#f7fee7','100': '#ecfcca','200': '#d8f999',
    '300': '#bbf451','400': '#9ae600','500': '#83cd00',
    '600': '#64a400','700': '#497d00','800': '#3c6300',
    '900': '#35530e','950': '#192e03'}
|  | @ -1,97 +0,0 @@ | |||
"""
This module contains the enrichment steps. They take time to run and are usually
expensive (API calls, ...), so each step should manage its own state and only run
again when the version of its input data differs from the version it last saved.
"""
| from .run import Run | ||||
| import pandas as pd | ||||
| from typing import Any, List, Dict | ||||
| import litellm | ||||
| 
 | ||||
def enrich(
    model: str,
    rpm: int,
    messages_to_process: List[List[Dict[str, str]]],
    schema: Dict[str, Any],
    chunk_size: int = 100,
):
    """Placeholder for the batched LLM enrichment call.

    Not implemented yet: the body is a bare ``pass``, so every call returns
    None and none of the arguments are used.

    Args:
        model: Model identifier passed by callers (e.g. "gpt-4.1-mini").
        rpm: Presumably a requests-per-minute limit - TODO confirm once implemented.
        messages_to_process: One chat message list per item to enrich.
        schema: JSON schema describing the expected structured response.
        chunk_size: Presumably the number of conversations per batch - TODO confirm.
    """
    # Use litellm.batch_completion
    pass
| 
 | ||||
def enrich_with_task_estimateability(run: Run) -> pd.DataFrame:
    """Ask the model whether each unique remote task is suitable for a time estimate.

    Work in progress: the responses are requested but not yet turned into a
    result frame or written to the cache, so an empty DataFrame is returned
    unless a cache file already exists.

    Args:
        run: Pipeline run; ``run.cache_dir`` and ``run.df_tasks`` are read.

    Returns:
        The cached parquet contents when present, otherwise an empty DataFrame.
    """
    output_path = run.cache_dir / "computed_task_estimateability.parquet"
    if output_path.exists():
        print(f"Loading cached task estimateability from {output_path}")
        return pd.read_parquet(output_path)

    df_remote_tasks = run.df_tasks[run.df_tasks['remote_status'] == 'remote'].copy()

    # In the old script, we only passed unique tasks to the API
    df_unique_tasks = df_remote_tasks.drop_duplicates(subset=['task'])

    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=[
            [
                {"role": "system", "content":  """
                    Judge whether the provided O*NET task is suitable for a time estimate. If it is a single, clearly-bounded activity, typically lasting minutes, hours, or a few days, then clearly yes. If it is a continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”), then clearly no.
                    """},
                {"role": "user", "content": f"Task: {row.task}"},
            ]
            for row in df_unique_tasks.itertuples()
        ],
        schema={
            "type": "object",
            # JSON Schema spells this type "boolean"; "bool" is not a valid type name.
            "properties": {"estimateable": {"type": "boolean"}},
            "required": ["estimateable"]
        },
        chunk_size=300,
    )

    # Create a new dataframe with just enough information to identify the task uniquely + estimateability classification, save it, return it. Careful: the "task" column in itself is not unique.
    return pd.DataFrame()
| 
 | ||||
def enrich_with_task_estimates(run: Run) -> pd.DataFrame:
    """Request a lower/upper time estimate for each task (unfinished stub).

    Returns the cached parquet file when it exists. Otherwise the function is
    not usable yet: the input frame is still a placeholder and the function
    ends by raising ``NotImplementedError``.

    Args:
        run: Pipeline run; only ``run.cache_dir`` is read here.

    Raises:
        NotImplementedError: Always, when no cache file exists and execution
            reaches the end of the function.
    """
    output_path = run.cache_dir / "computed_task_estimates.parquet"
    if output_path.exists():
        print(f"Loading cached task estimates from {output_path}")
        return pd.read_parquet(output_path)

    # NOTE(review): `...` binds the Ellipsis object, so `df.itertuples()` below
    # fails with AttributeError before the NotImplementedError is reached.
    df = ... # todo

    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=[
            [
                {"role": "system", "content":  "Estimate the time required to complete the following O*NET task. Your estimate should be a plausible range for how long it might take a typical, qualified worker to perform this task once. Provide your answer as a time range (lower and upper bounds). Do not provide explanations or apologies. If the task is not suitable for a time estimate (e.g., it is an ongoing responsibility), interpret it as a single, schedulable action."},
                {"role": "user", "content":  f"""
                    Task: {row.task}
                    For Occupation: {row.occupation_title}
                    Occupation Description: {row.occupation_description}"""}
            ]
            for row in df.itertuples()
        ],
        # Both bounds share the same shape: a numeric quantity plus a unit.
        schema={
            "type": "object",
            "properties": {
                "lower_bound_estimate": {
                    "type": "object",
                    "properties": {"quantity": {"type": "number"}, "unit": {"type": "string", "enum": ["minutes", "hours", "days"]}},
                    "required": ["quantity", "unit"],
                },
                "upper_bound_estimate": {
                    "type": "object",
                    "properties": {"quantity": {"type": "number"}, "unit": {"type": "string", "enum": ["minutes", "hours", "days"]}},
                    "required": ["quantity", "unit"],
                },
            },
            "required": ["lower_bound_estimate", "upper_bound_estimate"],
        },
        chunk_size=200,
    )

    # Create a new dataframe with just enough information to identify the task uniquely + the estimates classification, save it, return it. Careful: the "task" column in itself is not unique.
    raise NotImplementedError
|  | @ -1,50 +1,30 @@ | |||
| """ | ||||
| Fetchers retrieve remote data and return it in a format suitable for further processing, they also return its version, which should be considered opaque, though it is usually a checksum. | ||||
| """ | ||||
| 
 | ||||
| import sqlite3 | ||||
| from typing import Tuple | ||||
| import pandas as pd | ||||
| import requests | ||||
| import io | ||||
| import zipfile | ||||
| from pipeline.run import Run | ||||
| from pipeline.logger import logger | ||||
| import yaml | ||||
| from pathlib import Path | ||||
| from .logger import logger | ||||
| from typing import Tuple, Dict | ||||
| 
 | ||||
| def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]: | ||||
|     """ | ||||
|     Downloads the O*NET database, creates a local SQLite file from it, and returns a connection. | ||||
|     """ | ||||
|     version  = "29_1" | ||||
|     url = f"https://www.onetcenter.org/dl_files/database/db_{version}_mysql.zip" | ||||
|     db_path = run.cache_dir / f"onet_{version}.db" | ||||
|     run.meta.fetchers['onet'] = { | ||||
|         'url': url, | ||||
|         'version': version, | ||||
|         'db_path': str(db_path), | ||||
|     } | ||||
| ONET_VERSION  = "29_1" | ||||
| ONET_URL = f"https://www.onetcenter.org/dl_files/database/db_{ONET_VERSION}_mysql.zip" | ||||
| 
 | ||||
|     if db_path.exists(): | ||||
|         logger.info(f"Using cached O*NET database: {db_path}") | ||||
|         conn = sqlite3.connect(db_path) | ||||
|         return conn, version | ||||
| def fetch_onet_database(cache_dir: Path) -> sqlite3.Connection: | ||||
|     DB_PATH = cache_dir / f"onet_{ONET_VERSION}.db" | ||||
| 
 | ||||
|     logger.info(f"Downloading O*NET database from {url}") | ||||
|     response = requests.get(url, stream=True, headers={ | ||||
|     if DB_PATH.exists(): | ||||
|         logger.info(f"Using cached O*NET database: {DB_PATH}") | ||||
|         return sqlite3.connect(DB_PATH) | ||||
| 
 | ||||
|     logger.info(f"Downloading O*NET database from {ONET_URL}") | ||||
|     response = requests.get(ONET_URL, stream=True, headers={ | ||||
|         "User-Agent": "econ-agent/1.0" | ||||
|     }) | ||||
|     response.raise_for_status() | ||||
| 
 | ||||
|     # Read content into memory | ||||
|     zip_content = response.content | ||||
| 
 | ||||
|     db_path = run.cache_dir / f"onet_{version}.db" | ||||
| 
 | ||||
|     logger.info(f"Creating new O*NET database: {db_path}") | ||||
|     conn = sqlite3.connect(db_path) | ||||
| 
 | ||||
|     # Set performance PRAGMAs for fast import | ||||
|     logger.info("Creating new SQLite database with performance settings") | ||||
|     conn = sqlite3.connect(DB_PATH) | ||||
|     conn.executescript(""" | ||||
|         PRAGMA journal_mode = OFF; | ||||
|         PRAGMA synchronous = 0; | ||||
|  | @ -54,6 +34,7 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]: | |||
|         PRAGMA foreign_keys = ON; | ||||
|     """) | ||||
| 
 | ||||
|     zip_content = response.content | ||||
|     with zipfile.ZipFile(io.BytesIO(zip_content)) as z: | ||||
|         sql_scripts = [] | ||||
|         for filename in sorted(z.namelist()): | ||||
|  | @ -63,14 +44,10 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]: | |||
|         if not sql_scripts: | ||||
|             raise RuntimeError("No SQL files found in the O*NET zip archive.") | ||||
| 
 | ||||
|         # Combine and execute all SQL files in one transaction | ||||
|         full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;" | ||||
| 
 | ||||
|         logger.info("Executing SQL files in alphabetical order (single transaction mode)") | ||||
|         full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;" | ||||
|         conn.executescript(full_script) | ||||
|         logger.info("Database populated successfully. Restoring reliability settings...") | ||||
| 
 | ||||
|     # Restore reliability-focused settings after import | ||||
|     conn.executescript(""" | ||||
|         PRAGMA journal_mode = WAL; | ||||
|         PRAGMA synchronous = NORMAL; | ||||
|  | @ -81,87 +58,75 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]: | |||
|     """) | ||||
|     conn.execute("VACUUM;") | ||||
|     conn.commit() | ||||
|     logger.info("Reliability settings restored and database optimized successfully!") | ||||
| 
 | ||||
|     return conn, version | ||||
|     return conn | ||||
| 
 | ||||
| def fetch_oesm_data(run: Run) -> Tuple[pd.DataFrame, str]: | ||||
|     """ | ||||
|     Downloads the OESM national data from the BLS website. | ||||
|     """ | ||||
|     version = "23" | ||||
|     url = f"https://www.bls.gov/oes/special-requests/oesm{version}nat.zip" | ||||
|     parquet_path = run.cache_dir / "oesm.parquet" | ||||
|     run.meta.fetchers['oesm'] = { | ||||
|         'url': url, | ||||
|         'version': version, | ||||
|         'parquet_path': str(parquet_path), | ||||
|     } | ||||
| def fetch_oesm_data(cache_dir: Path) -> pd.DataFrame: | ||||
|     VERSION = "23" | ||||
|     URL = f"https://www.bls.gov/oes/special-requests/oesm{VERSION}nat.zip" | ||||
|     DATA_PATH = cache_dir / "oesm.parquet" | ||||
| 
 | ||||
|     if parquet_path.exists(): | ||||
|         logger.info(f"Using cached OESM data: {parquet_path}") | ||||
|         return pd.read_parquet(parquet_path), version | ||||
|     if DATA_PATH.exists(): | ||||
|         logger.info(f"Using cached OESM data: {DATA_PATH}") | ||||
|         return pd.read_parquet(DATA_PATH) | ||||
| 
 | ||||
|     logger.info(f"Downloading OESM data from {url}") | ||||
|     logger.info(f"Downloading OESM data from {URL}") | ||||
|     headers = {'User-Agent': 'econ-agent/1.0'} | ||||
|     response = requests.get(url, headers=headers) | ||||
|     response = requests.get(URL, headers=headers) | ||||
|     response.raise_for_status() | ||||
| 
 | ||||
|     zip_content = response.content | ||||
|     logger.info(f"OESM data version: {version}") | ||||
| 
 | ||||
|     logger.info(f"Creating new OESM data cache: {parquet_path}") | ||||
|     logger.info(f"Creating new OESM data cache: {DATA_PATH}") | ||||
|     with zipfile.ZipFile(io.BytesIO(zip_content)) as z: | ||||
|         # Find the excel file in the zip | ||||
|         excel_filename = None | ||||
|         for filename in z.namelist(): | ||||
|             logger.debug(f"Found file in OESM zip: {filename}") | ||||
|             if filename.lower().endswith(".xlsx"): | ||||
|                 excel_filename = filename | ||||
|                 break | ||||
| 
 | ||||
|         if excel_filename is None: | ||||
|             raise FileNotFoundError("Could not find the Excel file in the OESM zip archive.") | ||||
| 
 | ||||
|         logger.info(f"Reading {excel_filename} from zip archive.") | ||||
|         with z.open(excel_filename) as f: | ||||
|         with z.open(f"oesm{VERSION}national.xlsx") as f: | ||||
|             df = pd.read_excel(f, engine='openpyxl', na_values=['*', '#']) | ||||
| 
 | ||||
|     df.to_parquet(parquet_path) | ||||
|     logger.info(f"Saved OESM data to cache: {parquet_path}") | ||||
|     return df, version | ||||
|     df.to_parquet(DATA_PATH) | ||||
|     logger.info(f"Saved OESM data to cache: {DATA_PATH}") | ||||
|     return df | ||||
| 
 | ||||
| def fetch_epoch_remote_data(run: Run) -> Tuple[pd.DataFrame, str]: | ||||
|     """ | ||||
|     Downloads the EPOCH AI remote work task data. | ||||
|     """ | ||||
|     # This is the direct download link constructed from the Google Drive share link | ||||
|     version = "latest" | ||||
|     url = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r" | ||||
|     parquet_path = run.cache_dir / f"epoch_remote_{version}.parquet" | ||||
|     run.meta.fetchers['epoch_remote'] = { | ||||
|         'url': url, | ||||
|         'version': version, | ||||
|         'parquet_path': str(parquet_path), | ||||
|     } | ||||
| def fetch_epoch_remote_data(cache_dir: Path) -> pd.DataFrame: | ||||
|     URL = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r" | ||||
|     DATA_PATH = cache_dir / f"epoch_remote_latest.parquet" | ||||
| 
 | ||||
|     if parquet_path.exists(): | ||||
|         logger.info(f"Using cached EPOCH remote data: {parquet_path}") | ||||
|         return pd.read_parquet(parquet_path), version | ||||
|     if DATA_PATH.exists(): | ||||
|         logger.info(f"Using cached EPOCH remote data: {DATA_PATH}") | ||||
|         return pd.read_parquet(DATA_PATH) | ||||
| 
 | ||||
|     logger.info(f"Downloading EPOCH remote data from Google Drive: {url}") | ||||
|     logger.info(f"Downloading EPOCH remote data from Google Drive: {URL}") | ||||
| 
 | ||||
|     # Need to handle potential cookies/redirects from Google Drive | ||||
|     session = requests.Session() | ||||
|     session.headers.update({"User-Agent": "econ-agent/1.0"}) | ||||
|     response = session.get(url, stream=True) | ||||
|     response = session.get(URL, stream=True) | ||||
|     response.raise_for_status() | ||||
| 
 | ||||
|     csv_content = response.content | ||||
| 
 | ||||
|     logger.info(f"Creating new EPOCH remote data cache: {parquet_path}") | ||||
|     logger.info(f"Creating new EPOCH remote data cache: {DATA_PATH}") | ||||
|     df = pd.read_csv(io.BytesIO(csv_content)) | ||||
|     df.to_parquet(parquet_path) | ||||
|     logger.info(f"Saved EPOCH remote data to cache: {parquet_path}") | ||||
|     df.to_parquet(DATA_PATH) | ||||
| 
 | ||||
|     return df, version | ||||
|     return df | ||||
| 
 | ||||
def fetch_metr_data(cache_dir: Path) -> Dict:
    """
    Return the parsed METR benchmark results, downloading them on a cache miss.

    If ``cache_dir / "metr_benchmark_results.yaml"`` exists it is parsed and
    returned. Otherwise the file is downloaded, parsed, and only then written
    to the cache, so an invalid payload is never persisted.

    Args:
        cache_dir: Directory holding the cached YAML file.

    Returns:
        The document produced by ``yaml.safe_load``.

    Raises:
        requests.RequestException: If the download fails, times out, or
            returns an error status.
        yaml.YAMLError: If the cached or downloaded content is not valid YAML.
    """
    URL = "https://metr.org/assets/benchmark_results.yaml"
    DATA_PATH = cache_dir / "metr_benchmark_results.yaml"

    if DATA_PATH.exists():
        logger.info(f"Using cached METR data: {DATA_PATH}")
        # Read as bytes (the cache is written as bytes) and let PyYAML detect
        # the encoding instead of relying on the locale default.
        with open(DATA_PATH, "rb") as f:
            return yaml.safe_load(f)

    logger.info(f"Downloading METR data from {URL}")
    headers = {"User-Agent": "econ-agent/1.0"}
    # Without a timeout a stalled connection would block the pipeline forever.
    response = requests.get(URL, headers=headers, timeout=60)
    response.raise_for_status()

    yaml_content = response.content

    # Parse before caching: if the payload is malformed we fail here and leave
    # no broken cache file behind for subsequent runs.
    parsed = yaml.safe_load(yaml_content)

    logger.info(f"Creating new METR data cache: {DATA_PATH}")
    with open(DATA_PATH, "wb") as f:
        f.write(yaml_content)

    return parsed
|  |  | |||
|  | @ -1,5 +1,15 @@ | |||
| from .estimate_histplot import generate_estimate_histplot | ||||
| from .estimates_spread_per_occupation import generate_estimate_spread_per_occupation | ||||
| from .estimates_lower_vs_upper_scatter import generate_estimates_lower_vs_upper_scatter | ||||
| from .sequential_coherence_cdf import plot_sequential_coherence_cdf | ||||
| from .projected_automatable_wage_bill import generate_projected_automatable_wage_bill | ||||
| from .projected_task_automation import generate_projected_task_automation_plot | ||||
| 
 | ||||
| GENERATORS = [ | ||||
|     generate_estimate_histplot | ||||
|     generate_estimate_histplot, | ||||
|     generate_estimate_spread_per_occupation, | ||||
|     generate_estimates_lower_vs_upper_scatter, | ||||
|     #plot_sequential_coherence_cdf, | ||||
|     generate_projected_automatable_wage_bill, | ||||
|     generate_projected_task_automation_plot, | ||||
| ] | ||||
|  |  | |||
|  | @ -1,6 +1,32 @@ | |||
| from ..run import Run | ||||
| from pathlib import Path | ||||
| from typing import Generator | ||||
| import matplotlib.pyplot as plt | ||||
| import seaborn as sns | ||||
| import pandas as pd | ||||
| from ..utils import style_plot | ||||
| 
 | ||||
| def generate_estimate_histplot(run: Run) -> Generator[Path]: | ||||
|     raise NotImplementedError | ||||
def generate_estimate_histplot(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path, None, None]:
    """
    Generates a styled histogram of the distribution of midpoint time estimates.

    Args:
        output_dir: Directory the PNG is written to.
        df: Task data; must contain an 'estimate_midpoint' column.
        **kwargs: Accepted and ignored (presumably so every generator can be
            called with the same keyword set -- confirm against the runner).

    Yields:
        The path of the saved image.
    """
    # NOTE: typing.Generator needs all three parameters before Python 3.13;
    # `Generator[Path]` raises TypeError when the module is imported.
    style_plot()
    OUTPUT_PATH = output_dir / "estimate_distribution_histplot.png"

    fig, ax = plt.subplots()

    sns.histplot(
        data=df,
        x='estimate_midpoint',
        log_scale=True,
        ax=ax,
    )

    ax.set_xlabel("Task Time (minutes, log scale)")
    ax.set_ylabel("Number of Tasks")
    ax.set_title("Distribution of Time Estimates for Atomic Tasks")

    # Operate on this figure explicitly rather than on pyplot's "current" one.
    fig.tight_layout()
    fig.savefig(OUTPUT_PATH)
    plt.close(fig)

    yield OUTPUT_PATH
|  |  | |||
							
								
								
									
										56
									
								
								pipeline/generators/estimates_lower_vs_upper_scatter.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						|  | @ -0,0 +1,56 @@ | |||
| from pathlib import Path | ||||
| from typing import Generator | ||||
| import matplotlib.pyplot as plt | ||||
| import seaborn as sns | ||||
| import pandas as pd | ||||
| from ..utils import OCCUPATION_MAJOR_CODES, style_plot | ||||
| 
 | ||||
| 
 | ||||
def generate_estimates_lower_vs_upper_scatter(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path, None, None]:
    """
    Generates a styled scatter plot of lower-bound vs upper-bound time estimates for tasks.

    Args:
        output_dir: Directory the PNG is written to.
        df: Task data; must contain 'lb_estimate_in_minutes',
            'ub_estimate_in_minutes' and 'onetsoc_major' columns.
        **kwargs: Accepted and ignored.

    Yields:
        The path of the saved image.
    """
    # NOTE: typing.Generator needs all three parameters before Python 3.13.
    style_plot()
    OUTPUT_PATH = output_dir / "estimates_lower_vs_upper_scatter.png"

    plot_df = df.copy()
    # Replace onetsoc_major codes with their labels for the legend. Codes that
    # have no label keep their raw value (same fallback as the per-occupation
    # boxplot) instead of turning into NaN and dropping out of the plot.
    plot_df['onetsoc_major'] = plot_df['onetsoc_major'].map(
        lambda code: OCCUPATION_MAJOR_CODES.get(code, code)
    )

    fig, ax = plt.subplots(figsize=(12, 10))
    sns.scatterplot(
        data=plot_df,
        x='lb_estimate_in_minutes',
        y='ub_estimate_in_minutes',
        alpha=0.3,
        edgecolor=None,
        hue="onetsoc_major",
        ax=ax,
    )

    # 45-degree reference line (y = x), padded by 10% beyond the data range.
    lowest = min(df['lb_estimate_in_minutes'].min(), df['ub_estimate_in_minutes'].min())
    highest = max(df['lb_estimate_in_minutes'].max(), df['ub_estimate_in_minutes'].max())
    lims = (lowest * 0.9, highest * 1.1)
    ax.plot(lims, lims, color='black', linestyle='--', linewidth=1, zorder=0)

    # Helper lines marking where the upper bound is 2x, 10x and 100x the lower bound.
    for ratio in [2, 10, 100]:
        ax.plot(lims, [ratio * limit for limit in lims],
                linestyle=':', color='grey', linewidth=1, zorder=0)

    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlabel('Lower-bound (min, log scale)')
    ax.set_ylabel('Upper-bound (min, log scale)')
    ax.set_title('Lower vs Upper Estimates for All Tasks')

    # The legend sits outside the axes; bbox_inches='tight' keeps it in the image.
    ax.legend(title="Occupation Major Group", bbox_to_anchor=(1.02, 1), loc='upper left')

    fig.tight_layout()
    fig.savefig(OUTPUT_PATH, bbox_inches='tight')
    plt.close(fig)

    yield OUTPUT_PATH
							
								
								
									
										39
									
								
								pipeline/generators/estimates_spread_per_occupation.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						|  | @ -0,0 +1,39 @@ | |||
| from pathlib import Path | ||||
| from typing import Generator | ||||
| import matplotlib.pyplot as plt | ||||
| import seaborn as sns | ||||
| import pandas as pd | ||||
| from ..utils import OCCUPATION_MAJOR_CODES, style_plot | ||||
| 
 | ||||
| 
 | ||||
def generate_estimate_spread_per_occupation(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path, None, None]:
    """
    Generates a styled boxplot of the estimate range spread per major occupation group.

    Args:
        output_dir: Directory the PNG is written to.
        df: Task data; must contain 'onetsoc_major' and 'estimate_range' columns.
        **kwargs: Accepted and ignored.

    Yields:
        The path of the saved image.
    """
    # NOTE: typing.Generator needs all three parameters before Python 3.13.
    style_plot()
    OUTPUT_PATH = output_dir / "estimates_spread_per_occupation.png"

    fig, ax = plt.subplots(figsize=(10, 12))

    sns.boxplot(
        data=df,
        x='onetsoc_major',
        y='estimate_range',
        showfliers=False,
        ax=ax,
    )

    ax.set_yscale('log')
    ax.set_xlabel('Occupation')
    ax.set_ylabel('Range (upper-lower, minutes)')
    ax.set_title('Spread of time-range estimates per occupation')

    # Swap the occupation codes on the x-axis for their labels; codes without
    # a label are shown unchanged.
    labels = [
        OCCUPATION_MAJOR_CODES.get(tick.get_text(), tick.get_text())
        for tick in ax.get_xticklabels()
    ]
    ax.set_xticklabels(labels, rotation=60, ha='right')

    fig.tight_layout()
    fig.savefig(OUTPUT_PATH)
    plt.close(fig)

    yield OUTPUT_PATH
|  | @ -1,6 +0,0 @@ | |||
from typing import List

import pandas as pd


def must_have_columns(df: pd.DataFrame, columns: List[str]):
    """
    Raise if *df* lacks any of the required *columns*.

    Args:
        df: DataFrame to validate.
        columns: Column names that must all be present in ``df.columns``.

    Raises:
        ValueError: If at least one column is absent; the message lists only
            the columns that are actually missing.
    """
    # Report just the absent columns; echoing the full required list hides
    # which one is actually the problem.
    missing = [col for col in columns if col not in df.columns]
    if missing:
        raise ValueError(f"DataFrame is missing required columns: {missing}")
							
								
								
									
										229
									
								
								pipeline/generators/projected_automatable_wage_bill.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						|  | @ -0,0 +1,229 @@ | |||
| from pathlib import Path | ||||
| from typing import Generator, Dict, Tuple, Optional | ||||
| import pandas as pd | ||||
| import numpy as np | ||||
| import matplotlib.pyplot as plt | ||||
| import matplotlib.ticker as mticker | ||||
| from scipy.stats import linregress | ||||
| from datetime import datetime | ||||
| from ..utils import style_plot, LIME | ||||
| 
 | ||||
| def _generate_wage_projection_data( | ||||
|     metr_results: Dict, | ||||
|     df_with_wages: pd.DataFrame, | ||||
|     percentile_key: str, | ||||
|     doubling_time_modifier: float, | ||||
| ) -> Optional[Tuple[pd.DataFrame, pd.DataFrame, float]]: | ||||
|     """ | ||||
|     Generates wage projection data for different AI progress scenarios. | ||||
| 
 | ||||
|     Args: | ||||
|         metr_results: The METR benchmark data. | ||||
|         df_with_wages: DataFrame containing tasks with their estimated wage value. | ||||
|         percentile_key: The percentile to use from METR data (e.g., 'p50_horizon_length'). | ||||
|         doubling_time_modifier: Multiplier for the doubling time (e.g., 1.0 for baseline, | ||||
|                                   0.5 for optimistic, 2.0 for pessimistic). | ||||
| 
 | ||||
|     Returns: | ||||
|         A tuple of (metr_df, projection_df, doubling_time_days), or None if data is insufficient. | ||||
|     """ | ||||
|     all_model_data = [] | ||||
|     for model_name, data in metr_results.get("results", {}).items(): | ||||
|         for agent_name, agent_data in data.get("agents", {}).items(): | ||||
|             release_date_str = data.get("release_date") | ||||
|             horizon = agent_data.get(percentile_key, {}).get("estimate") | ||||
|             if release_date_str and horizon is not None: | ||||
|                 all_model_data.append({ | ||||
|                     "release_date": release_date_str, | ||||
|                     "horizon_minutes": horizon, | ||||
|                 }) | ||||
| 
 | ||||
|     if not all_model_data: | ||||
|         return None | ||||
| 
 | ||||
|     metr_df = pd.DataFrame(all_model_data).sort_values("release_date").reset_index(drop=True) | ||||
|     metr_df['release_date'] = pd.to_datetime(metr_df['release_date']) | ||||
|     metr_df = metr_df[metr_df['horizon_minutes'] > 0].copy() | ||||
| 
 | ||||
|     if len(metr_df) < 2: | ||||
|         return None | ||||
| 
 | ||||
|     metr_df['days_since_start'] = (metr_df['release_date'] - metr_df['release_date'].min()).dt.days | ||||
|     log_y = np.log(metr_df['horizon_minutes']) | ||||
|     slope, intercept, r_value, _, _ = linregress(metr_df['days_since_start'], log_y) | ||||
| 
 | ||||
|     # Apply the scenario modifier to the doubling time | ||||
|     base_doubling_time_days = np.log(2) / slope | ||||
|     modified_doubling_time_days = base_doubling_time_days * doubling_time_modifier | ||||
|     modified_slope = np.log(2) / modified_doubling_time_days | ||||
| 
 | ||||
|     start_date = metr_df['release_date'].min() | ||||
|     future_dates = pd.to_datetime(pd.date_range(start=start_date, end="2035-01-01", freq="ME")) | ||||
|     future_days = (future_dates - start_date).days.to_numpy() | ||||
| 
 | ||||
|     projected_log_horizon = intercept + modified_slope * future_days | ||||
|     projected_horizon_minutes = np.exp(projected_log_horizon) | ||||
| 
 | ||||
|     projection_df = pd.DataFrame({ | ||||
|         "date": future_dates, | ||||
|         "projected_coherence_minutes": projected_horizon_minutes, | ||||
|     }) | ||||
| 
 | ||||
|     # Calculate the total wage bill of tasks automated over time | ||||
|     for bound in ["lb", "mid", "ub"]: | ||||
|         col_name = 'estimate_midpoint' if bound == 'mid' else f'{bound}_estimate_in_minutes' | ||||
|         projection_df[f"automatable_wage_bill_{bound}"] = projection_df["projected_coherence_minutes"].apply( | ||||
|             lambda h: df_with_wages.loc[df_with_wages[col_name] <= h, 'wage_per_task'].sum() | ||||
|         ) | ||||
| 
 | ||||
|     # Also calculate for the actual METR data points for plotting | ||||
|     metr_df["automatable_wage_bill_mid"] = metr_df["horizon_minutes"].apply( | ||||
|          lambda h: df_with_wages.loc[df_with_wages['estimate_midpoint'] <= h, 'wage_per_task'].sum() | ||||
|     ) | ||||
| 
 | ||||
|     return metr_df, projection_df, modified_doubling_time_days | ||||
| 
 | ||||
| 
 | ||||
def _plot_scenario(ax, projection_df, metr_df, label, color, line_style='-'):
    """
    Draw one projection scenario on `ax`.

    Adds three artists in this order: the mid-point wage-bill line, the shaded
    band between the lower- and upper-bound wage bills, and the scatter of the
    observed METR data points.
    """
    projection_dates = projection_df["date"]

    # Mid-point projection line
    ax.plot(
        projection_dates,
        projection_df["automatable_wage_bill_mid"],
        label=label,
        color=color,
        linewidth=2.5,
        linestyle=line_style,
        zorder=3,
    )

    # Band spanned by the lower/upper bound estimates
    ax.fill_between(
        projection_dates,
        projection_df["automatable_wage_bill_lb"],
        projection_df["automatable_wage_bill_ub"],
        color=color,
        alpha=0.15,
        zorder=2,
    )

    # Observed METR data points, drawn on top of the line and band
    ax.scatter(
        metr_df['release_date'],
        metr_df['automatable_wage_bill_mid'],
        color=color,
        edgecolor='black',
        s=60,
        zorder=4,
        label="Model Capabilities (P50)",
    )
 | ||||
| 
 | ||||
def generate_projected_automatable_wage_bill(
    output_dir: Path,
    df: pd.DataFrame,
    task_summary_by_occupation_df: pd.DataFrame,
    metr_results: Dict,
    **kwargs,
) -> Generator[Path, None, None]:
    """
    Generates a plot projecting the automatable wage bill under different
    AI progress scenarios (optimistic, baseline, pessimistic).

    Args:
        output_dir: Directory the PNG is written to.
        df: Task data; must contain 'onetsoc_code' plus the estimate columns
            read by `_generate_wage_projection_data`.
        task_summary_by_occupation_df: Per-occupation summary with
            'onetsoc_code', 'wage_bill' and 'total_tasks' columns.
        metr_results: The METR benchmark data.
        **kwargs: Accepted and ignored.

    Yields:
        The path of the saved image. Nothing is yielded when no scenario
        could be projected.
    """
    style_plot()
    OUTPUT_PATH = output_dir / "projected_automatable_wage_bill_sensitivity.png"

    # 1. Calculate wage_per_task for each occupation
    wage_bill_info = task_summary_by_occupation_df[['onetsoc_code', 'wage_bill', 'total_tasks']].copy()
    wage_bill_info['wage_per_task'] = wage_bill_info['wage_bill'] / wage_bill_info['total_tasks']
    # Division by zero tasks yields +/-inf; count those occupations as 0.
    wage_bill_info = wage_bill_info.replace([np.inf, -np.inf], 0)
    wage_bill_info = wage_bill_info.drop(columns=['wage_bill', 'total_tasks'])

    # 2. Merge wage_per_task into the main task dataframe
    df_with_wages = pd.merge(df, wage_bill_info, on='onetsoc_code', how='left')
    # Assign the result back: `df[col].fillna(..., inplace=True)` acts on a
    # temporary under pandas copy-on-write and would leave the NaNs in place.
    df_with_wages['wage_per_task'] = df_with_wages['wage_per_task'].fillna(0)

    # 3. Generate data for all three scenarios
    scenarios = {
        "Optimistic": {"modifier": 0.5, "color": "tab:green", "style": "--"},
        "Baseline": {"modifier": 1.0, "color": LIME['600'], "style": "-"},
        "Pessimistic": {"modifier": 2.0, "color": "tab:red", "style": ":"},
    }

    projection_results = {}
    for name, config in scenarios.items():
        result = _generate_wage_projection_data(
            metr_results, df_with_wages, 'p50_horizon_length', config['modifier']
        )
        if result is not None:
            projection_results[name] = result

    if not projection_results:
        print("Warning: Could not generate any projection data. Skipping wage bill plot.")
        return

    # 4. Create the plot
    fig, ax = plt.subplots(figsize=(14, 9))

    # The observed points are the same in every scenario, so draw them once
    # using the baseline result.
    if "Baseline" in projection_results:
        metr_df, _, _ = projection_results["Baseline"]
        ax.scatter(
            metr_df['release_date'],
            metr_df['automatable_wage_bill_mid'],
            color='black',
            s=80,
            zorder=5,
            label="Model Capabilities (P50)",
        )

    legend_lines = []
    for name, (_, proj_df, doubling_time) in projection_results.items():
        config = scenarios[name]
        ax.plot(
            proj_df["date"],
            proj_df["automatable_wage_bill_mid"],
            color=config['color'],
            linestyle=config['style'],
            linewidth=2.5,
            zorder=3,
        )
        ax.fill_between(
            proj_df["date"],
            proj_df["automatable_wage_bill_lb"],
            proj_df["automatable_wage_bill_ub"],
            color=config['color'],
            alpha=0.15,
            zorder=2,
        )
        # Proxy line so the legend entry can carry the scenario's doubling time
        line = plt.Line2D([0], [0], color=config['color'], linestyle=config['style'], lw=2.5,
                          label=f'{name} (Doubling Time: {doubling_time:.0f} days)')
        legend_lines.append(line)

    # 5. Styling and annotations
    ax.set_title("Projected Automatable Wage Bill (P50 Coherence)", fontsize=18, pad=20)
    ax.set_xlabel("Year", fontsize=12)
    ax.set_ylabel("Automatable Annual Wage Bill (Trillions of USD)", fontsize=12)

    # Format Y-axis to show trillions
    def trillions_formatter(x, pos):
        return f'${x / 1e12:.1f}T'
    ax.yaxis.set_major_formatter(mticker.FuncFormatter(trillions_formatter))

    # Leave 5% headroom above the total wage bill of all tasks.
    total_wage_bill = df_with_wages['wage_per_task'].sum()
    ax.set_ylim(0, total_wage_bill * 1.05)

    if "Baseline" in projection_results:
        _, proj_df, _ = projection_results["Baseline"]
        ax.set_xlim(datetime(2022, 1, 1), proj_df["date"].max())

    # Create the legend from the custom lines and the scatter plot
    scatter_legend = ax.get_legend_handles_labels()[0]
    ax.legend(handles=legend_lines + scatter_legend, loc="upper left", fontsize=11)

    ax.grid(True, which="both", linestyle="--", linewidth=0.5)
    fig.tight_layout()
    fig.savefig(OUTPUT_PATH)
    plt.close(fig)

    print(f"Generated sensitivity analysis plot: {OUTPUT_PATH}")
    yield OUTPUT_PATH
							
								
								
									
										168
									
								
								pipeline/generators/projected_task_automation.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						|  | @ -0,0 +1,168 @@ | |||
| from pathlib import Path | ||||
| from typing import Generator, Dict, Tuple | ||||
| import pandas as pd | ||||
| import numpy as np | ||||
| import matplotlib.pyplot as plt | ||||
| from scipy.stats import linregress | ||||
| from datetime import datetime | ||||
| from ..utils import style_plot, LIME | ||||
| 
 | ||||
| def _generate_projection_data( | ||||
|     metr_results: Dict, | ||||
|     df: pd.DataFrame, | ||||
|     percentile_key: str, | ||||
| ) -> Tuple[pd.DataFrame, pd.DataFrame] | None: | ||||
|     """ | ||||
|     Generates projection data for a given percentile key (e.g., 'p50_horizon_length'). | ||||
|     Returns a tuple of (metr_df_with_pct, projection_df), or None if data is insufficient. | ||||
|     """ | ||||
|     # 1. Process METR data to get all model performance over time for the given percentile | ||||
|     all_model_data = [] | ||||
|     for model_name, data in metr_results.get("results", {}).items(): | ||||
|         for agent_name, agent_data in data.get("agents", {}).items(): | ||||
|             release_date_str = data.get("release_date") | ||||
|             horizon = agent_data.get(percentile_key, {}).get("estimate") | ||||
| 
 | ||||
|             if release_date_str and horizon is not None: | ||||
|                 unique_model_name = f"{model_name}-{agent_name}" | ||||
|                 all_model_data.append({ | ||||
|                     "model": unique_model_name, | ||||
|                     "release_date": release_date_str, | ||||
|                     "horizon_minutes": horizon, | ||||
|                 }) | ||||
| 
 | ||||
|     if not all_model_data: | ||||
|         print(f"Warning: No models with {percentile_key} found in METR data. Skipping.") | ||||
|         return None | ||||
| 
 | ||||
|     metr_df = pd.DataFrame(all_model_data).sort_values("release_date").reset_index(drop=True) | ||||
|     metr_df['release_date'] = pd.to_datetime(metr_df['release_date']) | ||||
| 
 | ||||
|     # 2. Perform log-linear regression on coherence over time | ||||
|     metr_df = metr_df[metr_df['horizon_minutes'] > 0].copy() | ||||
|     if len(metr_df) < 2: | ||||
|         print(f"Warning: Not enough data points for regression for {percentile_key}. Skipping.") | ||||
|         return None | ||||
| 
 | ||||
|     metr_df['days_since_start'] = (metr_df['release_date'] - metr_df['release_date'].min()).dt.days | ||||
|     log_y = np.log(metr_df['horizon_minutes']) | ||||
|     x = metr_df['days_since_start'] | ||||
| 
 | ||||
|     slope, intercept, r_value, _, _ = linregress(x, log_y) | ||||
|     doubling_time_days = np.log(2) / slope | ||||
|     print(f"METR all models {percentile_key} trend: R^2 = {r_value**2:.2f}, Doubling time = {doubling_time_days:.1f} days") | ||||
| 
 | ||||
|     # 3. Project coherence into the future | ||||
|     start_date = metr_df['release_date'].min() | ||||
|     future_dates = pd.to_datetime(pd.date_range(start=start_date, end="2035-01-01", freq="ME")) | ||||
|     future_days = (future_dates - start_date).days.to_numpy() | ||||
| 
 | ||||
|     projected_log_horizon = intercept + slope * future_days | ||||
|     projected_horizon_minutes = np.exp(projected_log_horizon) | ||||
| 
 | ||||
|     projection_df = pd.DataFrame({ | ||||
|         "date": future_dates, | ||||
|         "projected_coherence_minutes": projected_horizon_minutes, | ||||
|     }) | ||||
| 
 | ||||
|     # 4. Calculate the percentage of tasks automated over time based on our estimates | ||||
|     total_tasks = len(df) | ||||
|     if total_tasks == 0: | ||||
|         return None | ||||
| 
 | ||||
|     for bound in ["lb", "mid", "ub"]: | ||||
|         col_name = 'estimate_midpoint' if bound == 'mid' else f'{bound}_estimate_in_minutes' | ||||
|         projection_df[f"pct_automatable_{bound}"] = projection_df["projected_coherence_minutes"].apply( | ||||
|             lambda h: (df[col_name] <= h).sum() / total_tasks * 100 | ||||
|         ) | ||||
| 
 | ||||
|     metr_df["pct_automatable_mid"] = metr_df["horizon_minutes"].apply( | ||||
|          lambda h: (df['estimate_midpoint'] <= h).sum() / total_tasks * 100 | ||||
|     ) | ||||
| 
 | ||||
|     return metr_df, projection_df | ||||
| 
 | ||||
| 
 | ||||
def _plot_projection(ax, projection_df, metr_df, label, color, line_style='-'):
    """
    Draw one projection on `ax`.

    Adds three artists in this order: the mid-point automation line, the
    shaded band between the lower- and upper-bound percentages, and the
    scatter of observed METR data points. `label` is expected to look like
    "P50"; everything after its first character is used as the success rate
    in the scatter legend entry.
    """
    projection_dates = projection_df["date"]

    # Mid-point projection line
    ax.plot(
        projection_dates,
        projection_df["pct_automatable_mid"],
        label="Mid-point",
        color=color,
        linewidth=2.5,
        linestyle=line_style,
        zorder=3,
    )

    # Band spanned by the lower/upper bound estimates
    ax.fill_between(
        projection_dates,
        projection_df["pct_automatable_lb"],
        projection_df["pct_automatable_ub"],
        color=color,
        alpha=0.15,
        label="Lower/upper bound range",
        zorder=2,
    )

    # Observed METR data points, drawn on top of the line and band
    success_rate = label[1:]
    ax.scatter(
        metr_df['release_date'],
        metr_df['pct_automatable_mid'],
        color=color,
        edgecolor='black',
        s=60,
        zorder=4,
        label=f"Model with {success_rate}% success rate",
    )
| 
 | ||||
| 
 | ||||
def _render_projection_figure(output_dir, data, label, color, title, ylabel, filename):
    """Draw one projection figure, save it as output_dir / filename and return that path."""
    metr_df, proj_df = data
    fig, ax = plt.subplots(figsize=(12, 8))
    _plot_projection(ax, proj_df, metr_df, label, color)
    ax.set_title(title, fontsize=16, pad=20)
    ax.set_xlabel("Year")
    ax.set_ylabel(ylabel)
    # Slight headroom so a line sitting at exactly 100% is not clipped.
    ax.set_ylim(0, 100.5)
    ax.set_xlim(datetime(2022, 1, 1), proj_df["date"].max())
    ax.grid(True, which="both", linestyle="--", linewidth=0.5)
    ax.legend(loc="upper left")
    plt.tight_layout()
    output_path = output_dir / filename
    plt.savefig(output_path)
    plt.close(fig)
    return output_path


def generate_projected_task_automation_plot(
    output_dir: Path,
    metr_results: Dict,
    df: pd.DataFrame,
    **kwargs,
) -> Generator[Path, None, None]:
    """
    Generates plots projecting task automation based on METR's p50 and p80
    coherence data.

    Args:
        output_dir: Directory the PNGs are written to.
        metr_results: The METR benchmark data.
        df: Task data with the estimate columns read by `_generate_projection_data`.
        **kwargs: Accepted and ignored.

    Yields:
        The path of the P50 figure, then the path of the P80 figure; a figure
        is skipped when its projection data could not be generated.
    """
    style_plot()

    # Both projections are computed up front, before any figure is drawn.
    p50_data = _generate_projection_data(metr_results, df, 'p50_horizon_length')
    p80_data = _generate_projection_data(metr_results, df, 'p80_horizon_length')

    # Plot P50 alone
    if p50_data:
        yield _render_projection_figure(
            output_dir,
            p50_data,
            "P50",
            LIME['600'],
            title="How long before sequential coherence stops being a bottleneck?",
            ylabel="% of task automatable (50% success rate)",
            filename="projected_task_automation_p50.png",
        )

    # Plot P80 alone
    if p80_data:
        yield _render_projection_figure(
            output_dir,
            p80_data,
            "P80",
            'tab:cyan',
            title="Projected Task Automation (P80 AI Coherence)",
            ylabel="% of Estimable Economic Tasks Automatable",
            filename="projected_task_automation_p80.png",
        )
							
								
								
									
										54
									
								
								pipeline/generators/sequential_coherence_cdf.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						|  | @ -0,0 +1,54 @@ | |||
| from pathlib import Path | ||||
| import pandas as pd | ||||
| import matplotlib.pyplot as plt | ||||
| import matplotlib.ticker as mtick | ||||
| from ..utils import LIME, style_plot | ||||
| 
 | ||||
def plot_sequential_coherence_cdf(output_dir: Path, df: pd.DataFrame, **kwargs):
    """
    Plot the cumulative distribution of task time estimates and yield the saved path.

    Three step curves are drawn from `df`: the lower-bound, upper-bound and
    mid-point estimates, each showing the percentage of tasks whose estimate
    is at most X. The figure is written to
    `output_dir / "sequential_coherence_cdf.png"`.
    """
    style_plot()
    output_path = output_dir / "sequential_coherence_cdf.png"

    def cdf(series):
        """Return (sorted values, cumulative percentage reached at each value)."""
        ordered = series.sort_values().reset_index(drop=True)
        cumulative_pct = ((ordered.index + 1) / len(ordered)) * 100
        return ordered.values, cumulative_pct

    # Compute every curve before any drawing happens
    lower_curve = cdf(df['lb_estimate_in_minutes'])
    upper_curve = cdf(df['ub_estimate_in_minutes'])
    mid_curve = cdf(df['estimate_midpoint'])

    fig, ax = plt.subplots(figsize=(12, 7))

    # Step curves; the mid-point is drawn last, thicker and on top
    curves = [
        (lower_curve, dict(color=LIME['300'], linewidth=1.8, linestyle='--', zorder=2,
                           label='Lower bound estimate')),
        (upper_curve, dict(color=LIME['900'], linewidth=1.8, linestyle=':', zorder=3,
                           label='Upper bound estimate')),
        (mid_curve, dict(color=LIME['600'], linewidth=2.2, zorder=4, label='Mid-point')),
    ]
    for (xs, ys), step_style in curves:
        ax.step(xs, ys, where='post', **step_style)

    # Axes: logarithmic x, percentage y
    ax.set_xscale('log')
    ax.set_ylim(0, 100)
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(decimals=0))

    ax.set_title("% of Tasks With Sequential Coherence ≤ X")
    ax.set_xlabel("Sequential Coherence (X)")
    ax.set_ylabel("Cumulative Percentage of Tasks")

    # Human-readable tick positions (values in minutes) and their labels
    tick_spec = [
        (1, '1 min'), (5, '5 min'), (10, '10 min'), (30, '30 min'),
        (60, '1 hr'), (120, '2 hr'), (240, '4 hr'), (480, '8 hr'),
        (1440, '1 day'), (2880, '2 days'), (10080, '1 wk'),
        (43200, '30 days'), (129600, '90 days'), (259200, '180 days'),
        (525600, '1 yr'),
    ]
    ax.set_xticks([position for position, _ in tick_spec])
    ax.set_xticklabels([text for _, text in tick_spec], rotation=45, ha='right')

    ax.legend(loc='lower right')

    # Save and release the figure
    plt.tight_layout()
    plt.savefig(output_path, bbox_inches='tight')
    plt.close(fig)

    yield output_path
|  | @ -1,41 +0,0 @@ | |||
| """ | ||||
| This module defines the Metadata model for the pipeline. | ||||
| """ | ||||
| 
 | ||||
| from datetime import datetime | ||||
| from pydantic import BaseModel, Field | ||||
| from typing import Dict, Any | ||||
| 
 | ||||
class Metadata(BaseModel):
    """
    A Pydantic model for storing pipeline metadata.

    This class is intended to be instantiated once and passed through the
    pipeline. Each step in the pipeline can then add its own metadata.
    This provides a centralized and structured way to track data provenance,
    versions, and other important information.

    Every field has a default factory, so ``Metadata()`` can be built with no
    arguments; ``ts`` and ``commit`` are computed at instantiation time.
    """
    # Nested mapping of string keys to arbitrary metadata dicts.
    # Presumably keyed by fetcher name -- confirm against the pipeline steps.
    fetchers: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
    # Same shape as ``fetchers``; presumably keyed by enrichment step name.
    enrichments: Dict[str, Dict[str, Any]] = Field(default_factory=dict)

    # Creation timestamp formatted "YYYY-MM-DD HH:MM:SS" from a naive
    # ``datetime.now()`` (no timezone information is attached).
    ts: str = Field(default_factory=lambda: datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    # The lambda defers the name lookup until instantiation, because
    # ``_get_current_commit`` is defined after this class in the module.
    commit: str = Field(default_factory=lambda: _get_current_commit())
| 
 | ||||
| 
 | ||||
def _get_current_commit() -> str:
    """
    Return the hash printed by ``git rev-parse HEAD``.

    Falls back to "errored" when git exits with a non-zero status (for
    example outside a repository) and to "unknown" when the git executable
    cannot be found.
    """
    import subprocess

    try:
        completed = subprocess.run(
            ["git", "rev-parse", "HEAD"],
            capture_output=True,
            text=True,
            check=True,  # non-zero exit raises CalledProcessError
        )
    except subprocess.CalledProcessError:
        # git ran but failed (e.g., not a git repository)
        return "errored"
    except FileNotFoundError:
        # git is not installed
        return "unknown"
    return completed.stdout.strip()
|  | @ -1,140 +0,0 @@ | |||
| from .run import Run | ||||
| from .logger import logger | ||||
| import pandas as pd | ||||
| import numpy as np | ||||
| 
 | ||||
| 
 | ||||
def check_for_insanity(run: Run) -> Run:
    """Placeholder for run-level sanity checks; always raises NotImplementedError."""
    raise NotImplementedError()
| 
 | ||||
| 
 | ||||
def create_df_tasks(run: Run) -> Run:
    """
    Build ``run.df_tasks``: one row per (occupation, task) from the O*NET
    database, enriched with rating aggregates, DWA titles and remote status.

    The result is cached as a parquet file under ``run.cache_dir`` (the file
    name includes ``run.onet_version``). When that file already exists it is
    loaded and nothing is recomputed.

    Columns added on top of the base task/occupation info:
      * ``frequency_category_<n>``: FT ratings pivoted by category
      * ``importance_average``: mean of IM and IJ ratings
      * ``relevance_average``: mean of RT ratings
      * ``dwas``: list of DWA titles (empty list when none)
      * ``remote_status``: the ``Remote`` column of ``run.epoch_df``, matched on task text

    Returns the same ``run`` object with ``df_tasks`` set.
    """
    logger.info("Creating tasks dataframe")
    cache_path = run.cache_dir / f"onet_{run.onet_version}_tasks_with_remote_status.parquet"
    if cache_path.exists():
        logger.info(f"Loading cached tasks dataframe from {cache_path}")
        run.df_tasks = pd.read_parquet(cache_path)
        return run

    task_keys = ["onetsoc_code", "task_id"]

    def mean_rating(ratings: pd.DataFrame, selector: pd.Series, column: str) -> pd.DataFrame:
        """Average ``data_value`` per task over the selected rows, named ``column``."""
        subset = ratings[selector].copy()
        if subset.empty:
            return pd.DataFrame(columns=task_keys + [column])
        averaged = subset.groupby(task_keys)["data_value"].mean().reset_index()
        return averaged.rename(columns={"data_value": column})

    query = """
    SELECT
        tr.onetsoc_code,
        tr.task_id,
        ts.task,
        od.title AS occupation_title,
        od.description AS occupation_description,
        tr.scale_id,
        tr.category,
        tr.data_value,
        dr.dwa_title
    FROM
        task_ratings tr
    JOIN
        task_statements ts ON tr.task_id = ts.task_id
    JOIN
        occupation_data od ON tr.onetsoc_code = od.onetsoc_code
    LEFT JOIN
        tasks_to_dwas td ON tr.onetsoc_code = td.onetsoc_code AND tr.task_id = td.task_id
    LEFT JOIN
        dwa_reference dr ON td.dwa_id = dr.dwa_id;
    """
    raw = pd.read_sql_query(query, run.onet_conn)
    logger.info(f"Fetched {len(raw)} records (including DWA info) from the database.")

    # The DWA joins fan out the rating rows, so split the result into a
    # de-duplicated ratings table and a de-duplicated task -> DWA table.
    rating_columns = [
        "onetsoc_code", "task_id", "task", "occupation_title",
        "occupation_description", "scale_id", "category", "data_value"
    ]
    ratings = raw[rating_columns].drop_duplicates().reset_index(drop=True)
    dwa_links = (
        raw[["onetsoc_code", "task_id", "dwa_title"]]
        .dropna(subset=["dwa_title"])
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # 1. Frequency (FT): one column per frequency category.
    logger.info("Processing Frequency data")
    frequency_rows = ratings[ratings["scale_id"] == "FT"].copy()
    if frequency_rows.empty:
        empty_index = pd.MultiIndex(levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"])
        frequency = pd.DataFrame(index=empty_index)
    else:
        frequency = frequency_rows.pivot_table(
            index=task_keys,
            columns="category",
            values="data_value",
            fill_value=0,
        )
        frequency.columns = [f"frequency_category_{int(category)}" for category in frequency.columns]

    # 2. Importance (IM, IJ): averaged per task.
    logger.info("Processing Importance data")
    importance = mean_rating(ratings, ratings["scale_id"].isin(["IM", "IJ"]), "importance_average")

    # 3. Relevance (RT): averaged per task.
    logger.info("Processing Relevance data")
    relevance = mean_rating(ratings, ratings["scale_id"] == "RT", "relevance_average")

    # 4. DWAs: collect the titles of each task into a list.
    logger.info("Processing DWA data")
    dwa_lists = None
    if not dwa_links.empty:
        dwa_lists = dwa_links.groupby(task_keys)["dwa_title"].apply(list).reset_index()
        dwa_lists = dwa_lists.rename(columns={"dwa_title": "dwas"})

    # 5. Base task/occupation info, indexed by the task keys.
    logger.info("Extracting base task/occupation info")
    descriptive_columns = ["onetsoc_code", "task_id", "task", "occupation_title", "occupation_description"]
    base = ratings[descriptive_columns].drop_duplicates().set_index(task_keys)

    # 6. Left-join every aggregate onto the base info.
    logger.info("Merging processed ONET data")
    tasks = base.merge(frequency, left_index=True, right_index=True, how="left").reset_index()

    if importance.empty:
        tasks["importance_average"] = np.nan
    else:
        tasks = tasks.merge(importance, on=task_keys, how="left")

    if relevance.empty:
        tasks["relevance_average"] = np.nan
    else:
        tasks = tasks.merge(relevance, on=task_keys, how="left")

    if dwa_lists is None or dwa_lists.empty:
        tasks["dwas"] = [[] for _ in range(len(tasks))]
    else:
        tasks = tasks.merge(dwa_lists, on=task_keys, how="left")
        if "dwas" in tasks.columns:
            # Tasks without any DWA come back as NaN from the left join.
            tasks["dwas"] = tasks["dwas"].apply(lambda titles: titles if isinstance(titles, list) else [])

    tasks = tasks.replace({np.nan: None})

    # 7. Remote status from the EPOCH data, matched on the task text.
    logger.info("Merging with EPOCH remote data")
    tasks = tasks.merge(run.epoch_df[['Task', 'Remote']], left_on='task', right_on='Task', how='left')
    tasks = tasks.drop(columns='Task').rename(columns={'Remote': 'remote_status'})

    logger.info(f"Created tasks dataframe with shape {tasks.shape}")
    tasks.to_parquet(cache_path)

    run.df_tasks = tasks
    return run
|  | @ -1,27 +0,0 @@ | |||
| from pydantic import BaseModel, Field | ||||
| import sqlite3 | ||||
| import pandas as pd | ||||
| from pathlib import Path | ||||
| from typing import Optional | ||||
| from .metadata import Metadata | ||||
| 
 | ||||
class Run(BaseModel):
    # State container for one pipeline execution. Only ``cache_dir`` and
    # ``output_dir`` are required; every other field defaults to None (or a
    # fresh Metadata) and is meant to be filled in as the pipeline progresses.

    # Needed so pydantic accepts non-pydantic field types such as
    # sqlite3.Connection and pd.DataFrame.
    model_config = {"arbitrary_types_allowed": True}
    # === FETCHERS ===
    # Connection to the O*NET SQLite database and the version it was built from.
    onet_conn: Optional[sqlite3.Connection] = None
    onet_version: Optional[str] = None

    # OESM dataframe and its version string.
    oesm_df: Optional[pd.DataFrame] = None
    oesm_version: Optional[str] = None

    # EPOCH remote-status dataframe and its version string.
    epoch_df: Optional[pd.DataFrame] = None
    epoch_version: Optional[str] = None

    # === ENRICHMENTS ===
    # Outputs of the enrichment steps.
    task_estimateability_df: Optional[pd.DataFrame] = None
    task_estimates_df: Optional[pd.DataFrame] = None

    # Provenance information; a new Metadata instance is created per Run.
    meta: Metadata = Field(default_factory=Metadata)

    # Required: directory for cached intermediate data and directory for outputs.
    cache_dir: Path
    output_dir: Path
|  | @ -1,74 +1,215 @@ | |||
| import sqlite3 | ||||
| import os | ||||
| from .logger import logger | ||||
| import pandas as pd | ||||
| from dotenv import load_dotenv | ||||
| from .fetchers import fetch_oesm_data, fetch_epoch_remote_data, fetch_onet_database | ||||
| from .enrichments import enrich_with_task_estimateability, enrich_with_task_estimates | ||||
| from .postprocessors import check_for_insanity, create_df_tasks | ||||
| from .fetchers import fetch_onet_database, fetch_oesm_data, fetch_epoch_remote_data, ONET_VERSION, fetch_metr_data | ||||
| from .classification import classify_tasks_as_estimable, generate_time_estimates_for_tasks | ||||
| from .generators import GENERATORS | ||||
| from .run import Run | ||||
| from .constants import GRAY | ||||
| from .aggregate import create_task_summary_by_occupation_df, aggregate_task_summary_by_major_code | ||||
| from .utils import convert_to_minutes | ||||
| import argparse | ||||
| import platformdirs | ||||
| import seaborn as sns | ||||
| import matplotlib as mpl | ||||
| import numpy as np | ||||
| from pathlib import Path | ||||
| from typing import Optional | ||||
| 
 | ||||
| CACHE_DIR = platformdirs.user_cache_dir("econtai") | ||||
| 
 | ||||
| def run(output_dir: Path | Optional[str] = None): | ||||
|     load_dotenv() | ||||
|     _setup_graph_rendering() | ||||
| class Runner: | ||||
|     onet_conn: sqlite3.Connection | ||||
|     oesm_df: pd.DataFrame | ||||
|     epoch_df: pd.DataFrame | ||||
|     metr_results: dict | ||||
| 
 | ||||
|     if output_dir is None: | ||||
|         output_dir = Path("dist/") | ||||
|     elif isinstance(output_dir, str): | ||||
|     def __init__(self,  output_dir: Path | str, debug: bool, bust_estimability: bool, bust_estimates: bool): | ||||
|         if isinstance(output_dir, str): | ||||
|             output_dir = Path(output_dir).resolve() | ||||
| 
 | ||||
|         output_dir.mkdir(parents=True, exist_ok=True) | ||||
| 
 | ||||
|     current_run = Run(output_dir=output_dir, cache_dir=Path(CACHE_DIR).resolve()) | ||||
|     current_run.cache_dir.mkdir(parents=True, exist_ok=True) | ||||
|         self.output_dir = output_dir | ||||
|         self.intermediate_dir = self.output_dir / "intermediate" | ||||
|         self.intermediate_dir.mkdir(parents=True, exist_ok=True) | ||||
|         self.cache_dir = platformdirs.user_cache_path("econtai") | ||||
|         self.debug = debug | ||||
|         self.bust_estimability = bust_estimability | ||||
|         self.bust_estimates = bust_estimates | ||||
| 
 | ||||
|     # Fetchers (fetchers.py) | ||||
|     current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run) | ||||
|     current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run) | ||||
|     current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run) | ||||
|         if debug: | ||||
|             os.environ["LITELLM_LOG"] = os.environ.get("LITELLM_LOG", "INFO") | ||||
| 
 | ||||
|     current_run = create_df_tasks(current_run) | ||||
|     def run(self): | ||||
|         load_dotenv() | ||||
| 
 | ||||
|     # Enrichments (enrichments.py) | ||||
|     current_run.task_estimateability_df = enrich_with_task_estimateability(current_run) | ||||
|     current_run.task_estimates_df = enrich_with_task_estimates(current_run) | ||||
|         self.onet_conn = fetch_onet_database(self.cache_dir) | ||||
|         self.oesm_df = fetch_oesm_data(self.cache_dir) | ||||
|         self.epoch_df = fetch_epoch_remote_data(self.cache_dir) | ||||
|         self.metr_results = fetch_metr_data(self.cache_dir) | ||||
| 
 | ||||
|     # Postprocessors (postprocessors.py) | ||||
|     check_for_insanity(current_run) | ||||
|         self.df_tasks = self._create_df_tasks() | ||||
|         self.df_tasks['onetsoc_major'] = self.df_tasks['onetsoc_code'].str[:2] | ||||
| 
 | ||||
|         df_to_process = self.df_tasks[ | ||||
|             (self.df_tasks['importance_average'] > 3) & | ||||
|             (self.df_tasks['remote_status'] == 'remote') | ||||
|         ].copy() | ||||
| 
 | ||||
|         if self.debug: | ||||
|             df_to_process = df_to_process.head(10) | ||||
| 
 | ||||
|         task_estimability_df = classify_tasks_as_estimable(self.cache_dir, df_to_process, bust=self.bust_estimability) | ||||
|         self.df_tasks = pd.merge(self.df_tasks, task_estimability_df, on='task', how='left') | ||||
|         self.df_tasks['estimable'] = self.df_tasks['estimable'].fillna(False) | ||||
|         self.df_tasks.to_parquet(self.intermediate_dir / "df_tasks.parquet") | ||||
|         df_to_process = pd.merge(df_to_process, task_estimability_df, on='task', how='left') | ||||
|         df_to_process['estimable'] = self.df_tasks['estimable'].fillna(False) | ||||
| 
 | ||||
|         df_to_process = df_to_process[df_to_process['estimable']].copy() | ||||
| 
 | ||||
|         task_estimates_df = generate_time_estimates_for_tasks(self.cache_dir, df_to_process, bust=self.bust_estimates) | ||||
|         df = pd.merge(df_to_process, task_estimates_df, on=['onetsoc_code', 'task_id'], how='left') | ||||
|         df['lb_estimate_in_minutes'] = df.apply(lambda row: convert_to_minutes(row['lb_estimate_qty'], row['lb_estimate_unit']), axis=1) | ||||
|         df['ub_estimate_in_minutes'] = df.apply(lambda row: convert_to_minutes(row['ub_estimate_qty'], row['ub_estimate_unit']), axis=1) | ||||
|         df['estimate_range'] = df.ub_estimate_in_minutes - df.lb_estimate_in_minutes | ||||
|         df['estimate_ratio'] = np.divide(df.ub_estimate_in_minutes, df.lb_estimate_in_minutes).replace([np.inf, -np.inf], None) | ||||
|         df['estimate_midpoint'] = (df.lb_estimate_in_minutes + df.ub_estimate_in_minutes) / 2 | ||||
| 
 | ||||
|         df.to_parquet(self.intermediate_dir / "estimable_tasks_with_estimates.parquet") | ||||
| 
 | ||||
|         self.task_summary_by_occupation_df = create_task_summary_by_occupation_df(self.df_tasks, self.oesm_df) | ||||
|         self.task_summary_by_occupation_df.to_parquet(self.intermediate_dir / "task_summary_by_occupation.parquet") | ||||
|         self.task_summary_by_major_occupation_df = aggregate_task_summary_by_major_code(self.task_summary_by_occupation_df) | ||||
|         self.task_summary_by_major_occupation_df.to_parquet(self.intermediate_dir / "task_summary_by_major_occupation.parquet") | ||||
| 
 | ||||
|         self._check_for_insanity(df) | ||||
| 
 | ||||
|     # Generators (generators/) | ||||
|         for gen in GENERATORS: | ||||
|         gen(current_run) | ||||
|             for asset in gen(**{ | ||||
|                 "output_dir": self.output_dir, | ||||
|                 "runner": self, | ||||
|                 "df": df, | ||||
|                 "task_summary_by_occupation_df": self.task_summary_by_occupation_df, | ||||
|                 "task_summary_by_major_occupation_df": self.task_summary_by_major_occupation_df, | ||||
|                 "df_tasks": self.df_tasks, | ||||
|                 "oesm_df": self.oesm_df, | ||||
|                 "metr_results": self.metr_results, | ||||
|             }): | ||||
|                 logger.info(f"New asset: {asset}") | ||||
| 
 | ||||
|     def _create_df_tasks(self) -> pd.DataFrame: | ||||
|         DATA_PATH = self.cache_dir / f"onet_{ONET_VERSION}_tasks_with_remote_status.parquet" | ||||
|         if DATA_PATH.exists(): | ||||
|             logger.info(f"Loading cached tasks dataframe from {DATA_PATH}") | ||||
|             return pd.read_parquet(DATA_PATH) | ||||
| 
 | ||||
| def _setup_graph_rendering(): | ||||
|     mpl.rcParams.update({ | ||||
|         'figure.facecolor' : GRAY['50'], | ||||
|         'axes.facecolor'   : GRAY['50'], | ||||
|         'axes.edgecolor'   : GRAY['100'], | ||||
|         'axes.labelcolor'  : GRAY['700'], | ||||
|         'xtick.color'      : GRAY['700'], | ||||
|         'ytick.color'      : GRAY['700'], | ||||
|         'font.family'      : 'Inter', | ||||
|         'font.size'        : 11, | ||||
|     }) | ||||
|         logger.info("Creating tasks dataframe") | ||||
|         query = """ | ||||
|         SELECT | ||||
|         tr.onetsoc_code, | ||||
|         tr.task_id, | ||||
|         ts.task, | ||||
|         od.title AS occupation_title, | ||||
|         od.description AS occupation_description, | ||||
|         tr.scale_id, | ||||
|         tr.category, | ||||
|         tr.data_value | ||||
|         FROM | ||||
|         task_ratings tr | ||||
|         JOIN | ||||
|         task_statements ts ON tr.task_id = ts.task_id | ||||
|         JOIN | ||||
|         occupation_data od ON tr.onetsoc_code = od.onetsoc_code; | ||||
|         """ | ||||
|         ratings_df = pd.read_sql_query(query, self.onet_conn) | ||||
|         logger.info(f"Fetched {len(ratings_df)} task rating records from the database.") | ||||
| 
 | ||||
|         # 1. Handle Frequency (FT) | ||||
|         logger.info("Processing Frequency data") | ||||
|         freq_df = ratings_df[ratings_df["scale_id"] == "FT"].copy() | ||||
|         if not freq_df.empty: | ||||
|             freq_pivot = freq_df.pivot_table( | ||||
|                 index=["onetsoc_code", "task_id"], | ||||
|                 columns="category", | ||||
|                 values="data_value", | ||||
|                 fill_value=0, | ||||
|             ) | ||||
|             freq_pivot.columns = [f"frequency_category_{int(col)}" for col in freq_pivot.columns] | ||||
|         else: | ||||
|             raise ValueError("No frequency data.") | ||||
| 
 | ||||
|     sns.set_style("white") | ||||
|         # 2. Handle Importance (IM, IJ) | ||||
|         logger.info("Processing Importance data") | ||||
|         imp_df = ratings_df[ratings_df["scale_id"].isin(["IM", "IJ"])].copy() | ||||
|         if not imp_df.empty: | ||||
|             imp_avg = imp_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index() | ||||
|             imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True) | ||||
|         else: | ||||
|             raise ValueError("No importance data.") | ||||
| 
 | ||||
|         # 3. Handle Relevance (RT) | ||||
|         logger.info("Processing Relevance data") | ||||
|         rel_df = ratings_df[ratings_df["scale_id"] == "RT"].copy() | ||||
|         if not rel_df.empty: | ||||
|             rel_avg = rel_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index() | ||||
|             rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True) | ||||
|         else: | ||||
|             raise ValueError("No relevance data.") | ||||
| 
 | ||||
| def main(): | ||||
|     parser = argparse.ArgumentParser(description="Run the econtai pipeline.") | ||||
|     parser.add_argument("--output-dir", type=str, help="The directory to write output files to.") | ||||
|     args = parser.parse_args() | ||||
|     run(output_dir=args.output_dir) | ||||
|         # 5. Get Base Task/Occupation Info | ||||
|         logger.info("Extracting base task/occupation info") | ||||
|         base_cols = ["onetsoc_code", "task_id", "task", "occupation_title", "occupation_description"] | ||||
|         base_info = ratings_df[base_cols].drop_duplicates().set_index(["onetsoc_code", "task_id"]) | ||||
| 
 | ||||
|         # 6. Merge Processed ONET Data | ||||
|         logger.info("Merging processed ONET data") | ||||
|         final_df = base_info.merge(freq_pivot, left_index=True, right_index=True, how="left") | ||||
|         final_df = final_df.reset_index() | ||||
| 
 | ||||
|         if not imp_avg.empty: | ||||
|             final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left") | ||||
|         else: | ||||
|             final_df["importance_average"] = np.nan | ||||
| 
 | ||||
|         if not rel_avg.empty: | ||||
|             final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left") | ||||
|         else: | ||||
|             final_df["relevance_average"] = np.nan | ||||
| 
 | ||||
|         final_df = final_df.replace({np.nan: None}) | ||||
| 
 | ||||
|         # 7. Merge with EPOCH remote data | ||||
|         logger.info("Merging with EPOCH remote data") | ||||
|         final_df = pd.merge(final_df, self.epoch_df[['Task', 'Remote']], left_on='task', right_on='Task', how='left') | ||||
|         final_df = final_df.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'}) | ||||
| 
 | ||||
|         logger.info(f"Created tasks dataframe with shape {final_df.shape}") | ||||
|         final_df.to_parquet(DATA_PATH) | ||||
| 
 | ||||
|         return final_df | ||||
| 
 | ||||
|     def _check_for_insanity(self, df: pd.DataFrame): | ||||
|         if df['lb_estimate_in_minutes'].isnull().any(): | ||||
|             missing_count = df['lb_estimate_in_minutes'].isnull().sum() | ||||
|             raise ValueError(f"Found {missing_count} atomic tasks with missing 'lb_estimate_in_minutes'.") | ||||
| 
 | ||||
|         if df['ub_estimate_in_minutes'].isnull().any(): | ||||
|             missing_count = df['ub_estimate_in_minutes'].isnull().sum() | ||||
|             raise ValueError(f"Found {missing_count} atomic tasks with missing 'ub_estimate_in_minutes'.") | ||||
| 
 | ||||
|         valid_estimates = df.dropna(subset=['lb_estimate_in_minutes', 'ub_estimate_in_minutes']) | ||||
|         impossible_bounds = valid_estimates[ | ||||
|             (valid_estimates['lb_estimate_in_minutes'] <= 0) | | ||||
|             (valid_estimates['ub_estimate_in_minutes'] <= 0) | | ||||
|             (valid_estimates['lb_estimate_in_minutes'] > valid_estimates['ub_estimate_in_minutes']) | ||||
|         ] | ||||
|         if not impossible_bounds.empty: | ||||
|             raise ValueError(f"Found {len(impossible_bounds)} rows with impossible bounds (e.g., lb > ub or value <= 0).") | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|     main() | ||||
|     parser = argparse.ArgumentParser(description="Run the econtai pipeline.") | ||||
|     parser.add_argument("--output-dir", type=str, default="dist/", help="The directory to write output files to.") | ||||
|     parser.add_argument("--bust-estimability", action="store_true", help="Bust the saved task estimability classification (EXPENSIVE)") | ||||
|     parser.add_argument("--bust-estimates", action="store_true", help="Bust the tasks estimates (EXPENSIVE)") | ||||
|     parser.add_argument("--debug", action="store_true", help="Enable debug mode (e.g., process fewer tasks).") | ||||
| 
 | ||||
|     args = parser.parse_args() | ||||
|     Runner(output_dir=args.output_dir, debug=args.debug, bust_estimability=args.bust_estimability, bust_estimates=args.bust_estimates).run() | ||||
|  |  | |||
							
								
								
									
										222
									
								
								pipeline/utils.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						|  | @ -0,0 +1,222 @@ | |||
| import subprocess | ||||
| import matplotlib.colors as mcolors | ||||
| import matplotlib as mpl | ||||
| import seaborn as sns | ||||
| import tempfile | ||||
| import litellm | ||||
| import time | ||||
| import math | ||||
| from tqdm import tqdm | ||||
| from typing import Any, List, Dict | ||||
| from .logger import logger | ||||
| 
 | ||||
# Display names for occupation major groups, keyed by two-character string
# codes. Presumably these are the leading two characters of an O*NET-SOC
# code (SOC major groups) -- confirm against the code that looks them up.
OCCUPATION_MAJOR_CODES = {
    '11': 'Management',
    '13': 'Business & Financial',
    '15': 'Computer & Mathematical',
    '17': 'Architecture & Engineering',
    '19': 'Life, Physical, & Social Science',
    '21': 'Community & Social Service',
    '23': 'Legal',
    '25': 'Education, Training, & Library',
    '27': 'Arts, Design, & Media',
    '29': 'Healthcare Practitioners',
    '31': 'Healthcare Support',
    '33': 'Protective Service',
    '35': 'Food Preparation & Serving',
    '37': 'Building & Grounds Maintenance',
    '39': 'Personal Care & Service',
    '41': 'Sales & Related',
    '43': 'Office & Admin Support',
    '45': 'Farming, Fishing, & Forestry',
    '47': 'Construction & Extraction',
    '49': 'Installation, Maintenance, & Repair',
    '51': 'Production',
    '53': 'Transportation & Material Moving',
    '55': 'Military Specific',
}

# Gray palette used for plot backgrounds, edges and labels. Keys are shade
# names as strings ('50' lightest ... '950' darkest), values are hex colors.
# NOTE(review): values look like the Tailwind CSS "slate" scale -- confirm.
GRAY   = {'50':'#f8fafc','100':'#f1f5f9','200':'#e2e8f0',
    '300':'#cbd5e1','400':'#94a3b8','500':'#64748b',
    '600':'#475569','700':'#334155','800':'#1e293b',
    '900':'#0f172a','950':'#020617'}

# Accent palette for plotted data, same key scheme as GRAY.
# NOTE(review): values look like the Tailwind CSS "lime" scale -- confirm.
LIME            = {'50': '#f7fee7','100': '#ecfcca','200': '#d8f999',
    '300': '#bbf451','400': '#9ae600','500': '#83cd00',
    '600': '#64a400','700': '#497d00','800': '#3c6300',
    '900': '#35530e','950': '#192e03'}
| 
 | ||||
| 
 | ||||
def convert_to_minutes(qty, unit):
    """Return ``qty`` of ``unit`` expressed in minutes.

    ``unit`` must be one of "minute", "hour", "day", "week", "month",
    "trimester", "semester" or "year". A month counts as 30 days, a
    trimester as 90, a semester as 180 and a year as 365. Any other unit
    raises KeyError.
    """
    minutes_per_hour = 60
    minutes_per_day = minutes_per_hour * 24
    minutes_per_unit = {
        "minute": 1,
        "hour": minutes_per_hour,
        "day": minutes_per_day,
        "week": minutes_per_day * 7,
        "month": minutes_per_day * 30,
        "trimester": minutes_per_day * 90,
        "semester": minutes_per_day * 180,
        "year": minutes_per_day * 365,
    }
    return qty * minutes_per_unit[unit]
| 
 | ||||
| 
 | ||||
def pretty_display(df):
    """Print ``df`` to standard output and return None.

    An earlier variant rendered the frame to a temporary HTML file and
    opened it in a browser. That code sat after an unconditional ``return``,
    so it never ran, and it hard-coded a user-specific browser path and
    profile. It has been removed; the observable behavior is unchanged.
    """
    print(df)
| 
 | ||||
| 
 | ||||
def enrich(
    model: str,
    rpm: int, # Requests per minute
    messages_to_process: List[List[Dict[str, str]]],
    schema: Dict[str, Any],
    chunk_size: int = 100,
):
    """Run a list of conversations through ``litellm.batch_completion`` in chunks.

    Args:
        model: model identifier forwarded to litellm.
        rpm: requests-per-minute budget; values <= 0 disable throttling.
        messages_to_process: one message list per request.
        schema: JSON schema passed as the ``json_schema`` response format.
        chunk_size: maximum number of requests sent per batch call.

    Returns:
        A list with one entry per input conversation, in input order. An
        entry is the litellm response, or None when that request (or the
        whole batch call it belonged to) failed.
    """
    results = []
    total = len(messages_to_process)
    if total == 0:
        return results

    chunk_count = math.ceil(total / chunk_size)
    logger.info(f"Starting enrichment for {total} messages, in {chunk_count} chunks of up to {chunk_size} each.")

    # Seconds each request is allowed to "cost" under the RPM budget.
    seconds_per_request = 60.0 / rpm if rpm > 0 else 0

    for chunk_no in tqdm(range(chunk_count), desc="Enriching data in chunks"):
        started_at = time.time()

        offset = chunk_no * chunk_size
        batch = messages_to_process[offset:offset + chunk_size]
        if not batch:
            continue

        try:
            # One batched call per chunk.
            responses = litellm.batch_completion(
                model=model,
                messages=batch,
                response_format={
                    "type": "json_schema",
                    "json_schema": schema,
                },
            )

            # Each item is either a response or the exception raised for
            # that message; exceptions are recorded as None.
            for item in responses:
                if not isinstance(item, Exception):
                    results.append(item)
                    continue
                logger.error(f"API call within batch failed: {item}")
                results.append(None)

        except Exception as e:
            # The batch call itself failed (e.g., auth): pad with None so
            # positions stay aligned with the inputs.
            logger.error(f"litellm.batch_completion call failed for chunk {chunk_no+1}/{chunk_count}: {e}")
            results.extend([None] * len(batch))

        elapsed = time.time() - started_at

        # Throttle: if the chunk finished faster than its RPM budget
        # allows, sleep for the remainder.
        if seconds_per_request > 0:
            budget = len(batch) * seconds_per_request
            if elapsed < budget:
                pause = budget - elapsed
                logger.debug(f"Chunk processed in {elapsed:.2f}s. Sleeping for {pause:.2f}s to respect RPM.")
                time.sleep(pause)

    return results
| 
 | ||||
def get_contrasting_text_color(bg_color_hex_or_rgba):
    """
    Choose a text colour ('black' or 'white') that stays readable on a background.

    Args:
        bg_color_hex_or_rgba: Either a colour string, which is converted with
            ``mcolors.to_rgba``, or an iterable of channel values given as
            (r, g, b) or (r, g, b, a). Channels are assumed to be floats in
            the 0-1 range, as matplotlib produces -- TODO confirm for callers
            that pass their own tuples.

    Returns:
        'black' when the weighted luminance of the background is above 0.55,
        otherwise 'white'.
    """
    if isinstance(bg_color_hex_or_rgba, str):
        rgba = mcolors.to_rgba(bg_color_hex_or_rgba)
    else:
        rgba = bg_color_hex_or_rgba

    # Only the colour channels matter; tolerate a missing alpha channel so that
    # plain RGB triples work as well as RGBA.
    r, g, b, *_ = rgba

    # Weighted sum of the channels (green contributes most, blue least).
    luminance = 0.2126 * r + 0.7152 * g + 0.0722 * b
    return 'black' if luminance > 0.55 else 'white'
| 
 | ||||
| 
 | ||||
def style_plot():
    """
    Applies a consistent and professional style to all plots.

    This function has a global effect: it sets the seaborn style and palette
    and updates matplotlib's rcParams. It takes no arguments and returns None.

    The seaborn style is applied first and the project rcParams afterwards.
    ``sns.set_style`` rewrites a whole group of rcParams (face colours, label,
    text and tick colours, spines, sans-serif font list), so calling it last
    would silently undo most of the customisation below.
    """
    # Seaborn specific styles. Must run before the rcParams update below so the
    # project settings take precedence over the "whitegrid" defaults.
    sns.set_style("whitegrid", {
        'axes.edgecolor': GRAY['300'],
        'grid.color': GRAY['200'],
        'grid.linestyle': '--',
    })

    # Use shades of LIME as the primary color palette.
    # Sorting by integer value of keys, and reversed to have darker shades first.
    # Both ends of the scale are excluded: the lightest shades ('50', '100')
    # and the darkest ones ('700' through '950').
    excluded_shades = ('50', '100', '700', '800', '900', '950')
    lime_palette = [
        LIME[k]
        for k in sorted(LIME.keys(), key=int, reverse=True)
        if k not in excluded_shades
    ]
    sns.set_palette(lime_palette)

    mpl.rcParams.update({
        # Figure
        'figure.facecolor': GRAY['50'],
        'figure.edgecolor': 'none',
        'figure.figsize': (12, 8),
        'figure.dpi': 150,

        # Axes
        'axes.facecolor': GRAY['50'],
        'axes.edgecolor': GRAY['300'],
        'axes.grid': True,
        'axes.labelcolor': GRAY['800'],
        'axes.titlecolor': GRAY['900'],
        'axes.titlesize': 18,
        'axes.titleweight': 'bold',
        'axes.titlepad': 20,
        'axes.labelsize': 14,
        'axes.labelweight': 'semibold',
        'axes.labelpad': 10,
        'axes.spines.top': False,
        'axes.spines.right': False,
        'axes.spines.left': True,
        'axes.spines.bottom': True,

        # Text
        'text.color': GRAY['700'],

        # Ticks (tick marks hidden via zero size, labels kept)
        'xtick.color': GRAY['600'],
        'ytick.color': GRAY['600'],
        'xtick.labelsize': 12,
        'ytick.labelsize': 12,
        'xtick.major.size': 0,
        'ytick.major.size': 0,
        'xtick.minor.size': 0,
        'ytick.minor.size': 0,
        'xtick.major.pad': 8,
        'ytick.major.pad': 8,

        # Grid
        'grid.color': GRAY['200'],
        'grid.linestyle': '--',
        'grid.linewidth': 1,

        # Legend
        'legend.frameon': False,
        'legend.fontsize': 12,
        'legend.title_fontsize': 14,
        'legend.facecolor': 'inherit',

        # Fonts
        'font.family': 'sans-serif',
        'font.sans-serif': ['Inter'],
        'font.weight': 'normal',

        # Lines
        'lines.linewidth': 2,
        'lines.markersize': 6,
    })
|  | @ -16,6 +16,7 @@ dependencies = [ | |||
|     "python-dotenv>=1.1.1", | ||||
|     "requests>=2.32.4", | ||||
|     "rich>=14.0.0", | ||||
|     "scipy>=1.16.0", | ||||
|     "seaborn>=0.13.2", | ||||
| ] | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										31
									
								
								uv.lock
									
										
									
										generated
									
									
									
								
							
							
						
						|  | @ -1120,6 +1120,35 @@ wheels = [ | |||
|     { url = "https://files.pythonhosted.org/packages/75/04/5302cea1aa26d886d34cadbf2dc77d90d7737e576c0065f357b96dc7a1a6/rpds_py-0.26.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f14440b9573a6f76b4ee4770c13f0b5921f71dde3b6fcb8dabbefd13b7fe05d7", size = 232821, upload_time = "2025-07-01T15:55:55.167Z" }, | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "scipy" | ||||
| version = "1.16.0" | ||||
| source = { registry = "https://pypi.org/simple" } | ||||
| dependencies = [ | ||||
|     { name = "numpy" }, | ||||
| ] | ||||
| sdist = { url = "https://files.pythonhosted.org/packages/81/18/b06a83f0c5ee8cddbde5e3f3d0bb9b702abfa5136ef6d4620ff67df7eee5/scipy-1.16.0.tar.gz", hash = "sha256:b5ef54021e832869c8cfb03bc3bf20366cbcd426e02a58e8a58d7584dfbb8f62", size = 30581216, upload_time = "2025-06-22T16:27:55.782Z" } | ||||
| wheels = [ | ||||
|     { url = "https://files.pythonhosted.org/packages/46/95/0746417bc24be0c2a7b7563946d61f670a3b491b76adede420e9d173841f/scipy-1.16.0-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:e9f414cbe9ca289a73e0cc92e33a6a791469b6619c240aa32ee18abdce8ab451", size = 36418162, upload_time = "2025-06-22T16:19:56.3Z" }, | ||||
|     { url = "https://files.pythonhosted.org/packages/19/5a/914355a74481b8e4bbccf67259bbde171348a3f160b67b4945fbc5f5c1e5/scipy-1.16.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:bbba55fb97ba3cdef9b1ee973f06b09d518c0c7c66a009c729c7d1592be1935e", size = 28465985, upload_time = "2025-06-22T16:20:01.238Z" }, | ||||
|     { url = "https://files.pythonhosted.org/packages/58/46/63477fc1246063855969cbefdcee8c648ba4b17f67370bd542ba56368d0b/scipy-1.16.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:58e0d4354eacb6004e7aa1cd350e5514bd0270acaa8d5b36c0627bb3bb486974", size = 20737961, upload_time = "2025-06-22T16:20:05.913Z" }, | ||||
|     { url = "https://files.pythonhosted.org/packages/93/86/0fbb5588b73555e40f9d3d6dde24ee6fac7d8e301a27f6f0cab9d8f66ff2/scipy-1.16.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:75b2094ec975c80efc273567436e16bb794660509c12c6a31eb5c195cbf4b6dc", size = 23377941, upload_time = "2025-06-22T16:20:10.668Z" }, | ||||
|     { url = "https://files.pythonhosted.org/packages/ca/80/a561f2bf4c2da89fa631b3cbf31d120e21ea95db71fd9ec00cb0247c7a93/scipy-1.16.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6b65d232157a380fdd11a560e7e21cde34fdb69d65c09cb87f6cc024ee376351", size = 33196703, upload_time = "2025-06-22T16:20:16.097Z" }, | ||||
|     { url = "https://files.pythonhosted.org/packages/11/6b/3443abcd0707d52e48eb315e33cc669a95e29fc102229919646f5a501171/scipy-1.16.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1d8747f7736accd39289943f7fe53a8333be7f15a82eea08e4afe47d79568c32", size = 35083410, upload_time = "2025-06-22T16:20:21.734Z" }, | ||||
|     { url = "https://files.pythonhosted.org/packages/20/ab/eb0fc00e1e48961f1bd69b7ad7e7266896fe5bad4ead91b5fc6b3561bba4/scipy-1.16.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:eb9f147a1b8529bb7fec2a85cf4cf42bdfadf9e83535c309a11fdae598c88e8b", size = 35387829, upload_time = "2025-06-22T16:20:27.548Z" }, | ||||
|     { url = "https://files.pythonhosted.org/packages/57/9e/d6fc64e41fad5d481c029ee5a49eefc17f0b8071d636a02ceee44d4a0de2/scipy-1.16.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d2b83c37edbfa837a8923d19c749c1935ad3d41cf196006a24ed44dba2ec4358", size = 37841356, upload_time = "2025-06-22T16:20:35.112Z" }, | ||||
|     { url = "https://files.pythonhosted.org/packages/7c/a7/4c94bbe91f12126b8bf6709b2471900577b7373a4fd1f431f28ba6f81115/scipy-1.16.0-cp313-cp313-win_amd64.whl", hash = "sha256:79a3c13d43c95aa80b87328a46031cf52508cf5f4df2767602c984ed1d3c6bbe", size = 38403710, upload_time = "2025-06-22T16:21:54.473Z" }, | ||||
|     { url = "https://files.pythonhosted.org/packages/47/20/965da8497f6226e8fa90ad3447b82ed0e28d942532e92dd8b91b43f100d4/scipy-1.16.0-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:f91b87e1689f0370690e8470916fe1b2308e5b2061317ff76977c8f836452a47", size = 36813833, upload_time = "2025-06-22T16:20:43.925Z" }, | ||||
|     { url = "https://files.pythonhosted.org/packages/28/f4/197580c3dac2d234e948806e164601c2df6f0078ed9f5ad4a62685b7c331/scipy-1.16.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:88a6ca658fb94640079e7a50b2ad3b67e33ef0f40e70bdb7dc22017dae73ac08", size = 28974431, upload_time = "2025-06-22T16:20:51.302Z" }, | ||||
|     { url = "https://files.pythonhosted.org/packages/8a/fc/e18b8550048d9224426e76906694c60028dbdb65d28b1372b5503914b89d/scipy-1.16.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:ae902626972f1bd7e4e86f58fd72322d7f4ec7b0cfc17b15d4b7006efc385176", size = 21246454, upload_time = "2025-06-22T16:20:57.276Z" }, | ||||
|     { url = "https://files.pythonhosted.org/packages/8c/48/07b97d167e0d6a324bfd7484cd0c209cc27338b67e5deadae578cf48e809/scipy-1.16.0-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:8cb824c1fc75ef29893bc32b3ddd7b11cf9ab13c1127fe26413a05953b8c32ed", size = 23772979, upload_time = "2025-06-22T16:21:03.363Z" }, | ||||
|     { url = "https://files.pythonhosted.org/packages/4c/4f/9efbd3f70baf9582edf271db3002b7882c875ddd37dc97f0f675ad68679f/scipy-1.16.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:de2db7250ff6514366a9709c2cba35cb6d08498e961cba20d7cff98a7ee88938", size = 33341972, upload_time = "2025-06-22T16:21:11.14Z" }, | ||||
|     { url = "https://files.pythonhosted.org/packages/3f/dc/9e496a3c5dbe24e76ee24525155ab7f659c20180bab058ef2c5fa7d9119c/scipy-1.16.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e85800274edf4db8dd2e4e93034f92d1b05c9421220e7ded9988b16976f849c1", size = 35185476, upload_time = "2025-06-22T16:21:19.156Z" }, | ||||
|     { url = "https://files.pythonhosted.org/packages/ce/b3/21001cff985a122ba434c33f2c9d7d1dc3b669827e94f4fc4e1fe8b9dfd8/scipy-1.16.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4f720300a3024c237ace1cb11f9a84c38beb19616ba7c4cdcd771047a10a1706", size = 35570990, upload_time = "2025-06-22T16:21:27.797Z" }, | ||||
|     { url = "https://files.pythonhosted.org/packages/e5/d3/7ba42647d6709251cdf97043d0c107e0317e152fa2f76873b656b509ff55/scipy-1.16.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:aad603e9339ddb676409b104c48a027e9916ce0d2838830691f39552b38a352e", size = 37950262, upload_time = "2025-06-22T16:21:36.976Z" }, | ||||
|     { url = "https://files.pythonhosted.org/packages/eb/c4/231cac7a8385394ebbbb4f1ca662203e9d8c332825ab4f36ffc3ead09a42/scipy-1.16.0-cp313-cp313t-win_amd64.whl", hash = "sha256:f56296fefca67ba605fd74d12f7bd23636267731a72cb3947963e76b8c0a25db", size = 38515076, upload_time = "2025-06-22T16:21:45.694Z" }, | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "seaborn" | ||||
| version = "0.13.2" | ||||
|  | @ -1168,6 +1197,7 @@ dependencies = [ | |||
|     { name = "python-dotenv" }, | ||||
|     { name = "requests" }, | ||||
|     { name = "rich" }, | ||||
|     { name = "scipy" }, | ||||
|     { name = "seaborn" }, | ||||
| ] | ||||
| 
 | ||||
|  | @ -1184,6 +1214,7 @@ requires-dist = [ | |||
|     { name = "python-dotenv", specifier = ">=1.1.1" }, | ||||
|     { name = "requests", specifier = ">=2.32.4" }, | ||||
|     { name = "rich", specifier = ">=14.0.0" }, | ||||
|     { name = "scipy", specifier = ">=1.16.0" }, | ||||
|     { name = "seaborn", specifier = ">=0.13.2" }, | ||||
| ] | ||||
| 
 | ||||
|  |  | |||
 Félix Dorn
						Félix Dorn