progress
This commit is contained in:
		
							parent
							
								
									2da206d368
								
							
						
					
					
						commit
						b7c94590f9
					
				
					 14 changed files with 2200 additions and 13 deletions
				
			
		
							
								
								
									
										507
									
								
								old/add_task_estimates.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										507
									
								
								old/add_task_estimates.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,507 @@ | ||||||
|  | import pandas as pd | ||||||
|  | import litellm | ||||||
|  | import dotenv | ||||||
|  | import os | ||||||
|  | import time | ||||||
|  | import json | ||||||
|  | import math | ||||||
|  | import numpy as np | ||||||
|  | 
 | ||||||
# --- Configuration ---
# Model identifier handed to litellm. Make sure this model supports
# json_schema or structured output.
MODEL = "gpt-4.1-mini"
# Requests per minute; used to pace successive chunks.
RATE_LIMIT = 5000
# Number of requests sent in a single batch call.
CHUNK_SIZE = 300
SECONDS_PER_MINUTE = 60
# This CSV should contain the tasks to be processed; estimates are written
# back into the same file.
FILENAME = "tasks_with_estimates.csv"
|  | 
 | ||||||
# --- Prompts and Schema ---
# NOTE(review): the prompt text below contains typos ("time to completion
# time", "in one the allowed units", "activies"). It is sent to the model
# verbatim, so it is reproduced unchanged here; fix deliberately if desired.

# System prompt: three paragraphs separated by one blank line each.
SYSTEM_PROMPT = "\n\n".join(
    (
        "You are an expert assistant evaluating the time to completion required for job tasks. Your goal is to estimate the time range needed for a skilled human to complete the following job task remotely, without supervision.",
        "Provide a lower and upper bound estimate for the time to completion time. These bounds should capture the time within which approximately 80% of instances of performing this specific task are typically completed by a qualified individual.",
        "Base your estimate on the provided task description, its associated activities, and the occupational context. Your estimate must be in one the allowed units: minute, hour, day, week, month, trimester, semester, year.",
    )
)

# Per-task user message. Placeholders: {task}, {dwas}, {occupation_title},
# {occupation_description}.
USER_MESSAGE_TEMPLATE = "\n".join(
    (
        "Please estimate the time range for the following remote task:",
        "",
        "**Task Description:** {task}",
        "**Relevant activies for the task:**",
        "{dwas}",
        "",
        "**Occupation Category:** {occupation_title}",
        "**Occupation Description:** {occupation_description}",
        "",
        "Consider the complexity and the typical steps involved.",
    )
)
|  | 
 | ||||||
# Units of time the model may answer in; also used as the enum of both
# bounds in the response schema.
ALLOWED_UNITS = [
    "minute",
    "hour",
    "day",
    "week",
    "month",
    "trimester",
    "semester",
    "year",
]


def _bound_schema(which):
    """Return the schema fragment for one bound; ``which`` is "lower" or "upper"."""
    quantity = {
        "type": "number",
        "description": f"The numerical value for the {which} bound of the estimate.",
    }
    unit = {
        "type": "string",
        "enum": ALLOWED_UNITS,
        "description": f"The unit of time for the {which} bound.",
    }
    return {
        "type": "object",
        "properties": {"quantity": quantity, "unit": unit},
        "required": ["quantity", "unit"],
        "additionalProperties": False,
    }


# JSON schema describing the structured response: an object holding a
# lower and an upper bound, each a {quantity, unit} pair.
SCHEMA_FOR_VALIDATION = {
    "name": "estimate_time",
    "strict": True,  # Enforce schema adherence
    "schema": {
        "type": "object",
        "properties": {
            "lower_bound_estimate": _bound_schema("lower"),
            "upper_bound_estimate": _bound_schema("upper"),
        },
        "required": ["lower_bound_estimate", "upper_bound_estimate"],
        "additionalProperties": False,
    },
}
|  | 
 | ||||||
|  | 
 | ||||||
def save_dataframe(df_to_save, filename):
    """Save the DataFrame to ``filename`` as CSV via a temp-file-and-replace write.

    The frame is first written to ``<filename>.tmp`` and then moved over the
    destination with ``os.replace``, so the destination is only ever swapped
    for a completely written file. This is a best-effort checkpoint: failures
    while writing or replacing are printed, not raised, and the temporary file
    is removed when possible.

    Args:
        df_to_save: DataFrame to persist (written with ``utf-8-sig`` encoding
            and without the index).
        filename: Destination path, as ``str`` or ``os.PathLike``.
    """
    # Resolve both paths before entering the try block: the cleanup code in
    # the handler needs temp_filename to exist, and os.fspath lets callers
    # pass pathlib.Path objects (Path + ".tmp" would raise TypeError).
    filename = os.fspath(filename)
    temp_filename = filename + ".tmp"
    try:
        df_to_save.to_csv(temp_filename, encoding="utf-8-sig", index=False)
        os.replace(temp_filename, filename)
    except Exception as e:
        print(f"--- Error saving DataFrame to {filename}: {e} ---")
        # Do not leave a partial temp file behind.
        if os.path.exists(temp_filename):
            try:
                os.remove(temp_filename)
            except OSError as remove_err:
                print(
                    f"--- Error removing temporary save file {temp_filename}: {remove_err} ---"
                )
|  | 
 | ||||||
# Estimate columns maintained by create_task_estimates() and the dtype each
# one must have.
_ESTIMATE_COLUMNS = {
    "lb_estimate_qty": float,
    "lb_estimate_unit": object,
    "ub_estimate_qty": float,
    "ub_estimate_unit": object,
}

# Input columns that must be present and non-empty for a row to be sent.
_REQUIRED_COLUMNS = ["task", "occupation_title", "occupation_description", "dwas"]


def _ensure_estimate_columns(df):
    """Create or normalise the estimate columns of ``df`` in place.

    Returns True when a column was added or had its dtype corrected, i.e. when
    the caller should persist the frame.
    """
    changed = False
    for col_name, target_dtype in _ESTIMATE_COLUMNS.items():
        is_float = target_dtype is float
        if col_name not in df.columns:
            # Initialize with a type-compatible missing value.
            df[col_name] = np.nan if is_float else pd.NA
            df[col_name] = df[col_name].astype(target_dtype)
            print(f"Added '{col_name}' column as {df[col_name].dtype}.")
            changed = True
        elif df[col_name].dtype != pd.Series(dtype=target_dtype).dtype:
            current_pd_dtype = df[col_name].dtype
            try:
                if is_float:
                    df[col_name] = pd.to_numeric(df[col_name], errors="coerce")
                else:
                    df[col_name] = df[col_name].astype(object)
            except (TypeError, ValueError) as e:
                print(
                    f"Warning: Could not convert column '{col_name}' to {target_dtype}: {e}. Current dtype: {current_pd_dtype}"
                )
            else:
                print(f"Corrected dtype of '{col_name}' to {df[col_name].dtype}.")
                changed = True

        # Standardise missing values. The result is assigned back to the
        # frame: calling an inplace method on df[col] is chained assignment,
        # which is deprecated and does nothing under pandas Copy-on-Write.
        if is_float:
            df[col_name] = pd.to_numeric(df[col_name], errors="coerce")
        else:
            column = df[col_name].astype(object)
            df[col_name] = column.mask(column.isna() | column.isin([""]), pd.NA)
    return changed


def _load_tasks_dataframe():
    """Read FILENAME, make sure the estimate columns exist, and return the frame.

    Terminates the process with a non-zero status when the file is missing or
    cannot be read or initialised.
    """
    try:
        df = pd.read_csv(FILENAME, encoding="utf-8-sig")
        print(f"Successfully read {len(df)} rows from {FILENAME}.")
        if _ensure_estimate_columns(df):
            print(f"Saving {FILENAME} after adding/adjusting estimate columns.")
            save_dataframe(df, FILENAME)
    except FileNotFoundError:
        print(
            f"Error: {FILENAME} not found. Please ensure the file exists and contains task data."
        )
        raise SystemExit(1) from None
    except Exception as e:
        print(f"Error reading or initializing {FILENAME}: {e}")
        raise SystemExit(1) from None
    return df


def _build_messages(df_to_process):
    """Build one chat message list per row that has all required fields.

    Returns ``(messages_list, valid_original_indices)``; the two lists are
    aligned, and the indices are labels of the original DataFrame.
    """
    absent = [col for col in _REQUIRED_COLUMNS if col not in df_to_process.columns]
    if absent:
        # Fail once with a clear message instead of warning for every row.
        print(f"Error: {FILENAME} is missing required columns: {', '.join(absent)}.")
        raise SystemExit(1)

    print(
        f"Preparing messages for up to {len(df_to_process)} rows starting from original index {df_to_process.index[0]}..."
    )
    print(f"Checking for required columns: {_REQUIRED_COLUMNS}")

    messages_list = []
    valid_original_indices = []
    skipped = 0
    for index, row in df_to_process[_REQUIRED_COLUMNS].iterrows():
        missing_or_empty = [
            col
            for col in _REQUIRED_COLUMNS
            if pd.isna(row[col]) or str(row[col]).strip() == ""
        ]
        if missing_or_empty:
            print(
                f"Warning: Skipping row original index {index} due to missing/empty required data in columns: {', '.join(missing_or_empty)}."
            )
            skipped += 1
            continue

        try:
            user_message = USER_MESSAGE_TEMPLATE.format(
                **{col: row[col] for col in _REQUIRED_COLUMNS}
            )
        except KeyError as e:
            # Only reachable if the template names a placeholder that is not
            # one of the required columns.
            print(
                f"Error: Skipping row original index {index} due to formatting error - missing key: {e}. Check USER_MESSAGE_TEMPLATE and CSV columns."
            )
            skipped += 1
            continue

        messages_list.append(
            [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_message},
            ]
        )
        valid_original_indices.append(index)

    print(
        f"Prepared {len(messages_list)} valid message sets for batch completion (skipped {skipped} rows)."
    )
    return messages_list, valid_original_indices


def _coerce_quantity(raw):
    """Return ``raw`` as a finite, non-negative float, or None if it is not one."""
    # bool is a subclass of int; a JSON true/false is not a quantity.
    if isinstance(raw, bool) or not isinstance(raw, (int, float)):
        return None
    try:
        value = float(raw)
    except OverflowError:
        return None
    # json.loads accepts NaN/Infinity, so both must be rejected explicitly.
    if not math.isfinite(value) or value < 0:
        return None
    return value


def _parse_estimate(response, original_df_index):
    """Validate one model response and return the four estimate fields.

    Returns a dict keyed by the estimate column names, or None (after printing
    a warning) when the response is missing, malformed or fails validation.
    """
    if response is None:
        print(
            f"Skipping processing for original index {original_df_index} due to API call failure for this item (response is None)."
        )
        return None

    content_str = None
    try:
        choices = response.choices
        first_choice = choices[0] if choices else None
        message = first_choice.message if first_choice is not None else None
        content_str = message.content if message else None
        if not content_str:
            finish_reason = (
                first_choice.finish_reason
                if first_choice is not None and first_choice.finish_reason
                else "unknown"
            )
            print(
                f"Warning: Received non-standard or empty response content for original index {original_df_index}. "
                f"Finish Reason: '{finish_reason}'. Message: 'No content in message.'. Raw Choices: {choices}"
            )
            return None

        estimate_data = json.loads(content_str)
        lower = estimate_data.get("lower_bound_estimate") if isinstance(estimate_data, dict) else None
        upper = estimate_data.get("upper_bound_estimate") if isinstance(estimate_data, dict) else None
        if not (isinstance(lower, dict) and isinstance(upper, dict)):
            print(
                f"Warning: Missing or malformed estimate dicts in JSON for original index {original_df_index}. Content: '{content_str}'"
            )
            return None

        estimates = {}
        is_valid_item = True
        for prefix, bound in (("lb", lower), ("ub", upper)):
            qty_raw = bound.get("quantity")
            qty_val = _coerce_quantity(qty_raw)
            if qty_val is None:
                print(
                    f"Warning: Invalid {prefix}_quantity for original index {original_df_index}: {qty_raw}"
                )
                is_valid_item = False

            unit_raw = bound.get("unit")
            if unit_raw not in ALLOWED_UNITS:
                print(
                    f"Warning: Invalid {prefix}_unit for original index {original_df_index}: '{unit_raw}'"
                )
                is_valid_item = False

            estimates[f"{prefix}_estimate_qty"] = qty_val
            estimates[f"{prefix}_estimate_unit"] = unit_raw
        # Only a fully valid item is stored; partial results are discarded.
        return estimates if is_valid_item else None

    except json.JSONDecodeError:
        print(
            f"Warning: Could not decode JSON for original index {original_df_index}. Content received: '{content_str}'"
        )
    except AttributeError as ae:
        print(
            f"Warning: Missing expected attribute processing response for original index {original_df_index}: {ae}. Response: {response}"
        )
    except Exception as e:
        # Per-item isolation: one bad response must not abort the whole chunk.
        print(
            f"Warning: An unexpected error occurred processing response for original index {original_df_index}: {type(e).__name__} - {e}. Response: {response}"
        )
    return None


def _pause_for_rate_limit(request_count, chunk_duration):
    """Sleep long enough that ``request_count`` requests respect RATE_LIMIT."""
    time_per_request = SECONDS_PER_MINUTE / RATE_LIMIT if RATE_LIMIT > 0 else 0
    pause_needed = max(0, request_count * time_per_request - chunk_duration)
    if pause_needed > 0:
        print(
            f"Pausing for {pause_needed:.2f} seconds to respect rate limit ({RATE_LIMIT}/min)..."
        )
        time.sleep(pause_needed)


def create_task_estimates():
    """Fill in missing time estimates in FILENAME using the configured model.

    Rows whose ``lb_estimate_qty`` is missing are sent to the model in chunks
    of CHUNK_SIZE through ``litellm.batch_completion``. Each valid response is
    written into the four estimate columns and the CSV is saved after every
    chunk that produced results, so an interrupted run resumes where it left
    off. Rows that fail validation stay empty and are retried on the next run.

    Raises:
        SystemExit: with status 1 when the input file is missing, unreadable
            or lacks a required column; with status 0 when there is nothing
            to process.
    """
    df = _load_tasks_dataframe()

    # --- Identify Rows to Process ---
    unprocessed_mask = df["lb_estimate_qty"].isna()
    if not unprocessed_mask.any():
        print(
            "All rows seem to have estimates already (based on 'lb_estimate_qty'). Exiting."
        )
        raise SystemExit(0)
    df_to_process = df.loc[unprocessed_mask]
    print(
        f"Resuming processing. First unprocessed row found at index {df_to_process.index[0]}."
    )

    # --- Prepare messages (only for rows needing processing) ---
    messages_list, valid_original_indices = _build_messages(df_to_process)
    if not messages_list:
        print("No valid rows found to process after checking required data. Exiting.")
        raise SystemExit(0)

    # --- Call batch_completion in chunks with rate limiting and periodic saving ---
    total_messages_to_send = len(messages_list)
    num_chunks = math.ceil(total_messages_to_send / CHUNK_SIZE)
    print(
        f"\nStarting batch completion for {total_messages_to_send} items in {num_chunks} chunks..."
    )

    overall_start_time = time.time()
    processed_count_total = 0

    chunk_starts = range(0, total_messages_to_send, CHUNK_SIZE)
    for chunk_no, start in enumerate(chunk_starts, start=1):
        stop = min(start + CHUNK_SIZE, total_messages_to_send)
        message_chunk = messages_list[start:stop]
        chunk_original_indices = valid_original_indices[start:stop]

        print(
            f"\nProcessing chunk {chunk_no}/{num_chunks} (Messages {start + 1}-{stop} of this run)..."
            f" Corresponding to original indices: {min(chunk_original_indices)} - {max(chunk_original_indices)}"
        )
        chunk_start_time = time.time()
        try:
            print(f"Sending {len(message_chunk)} requests for chunk {chunk_no}...")
            responses = litellm.batch_completion(
                model=MODEL,
                messages=message_chunk,
                response_format={
                    "type": "json_schema",
                    "json_schema": SCHEMA_FOR_VALIDATION,
                },
                num_retries=3,
            )
            print(f"Chunk {chunk_no} API call completed.")
        except Exception as e:
            print(f"Error during litellm.batch_completion for chunk {chunk_no}: {e}")
            # Keep the list aligned with the chunk so every item is reported.
            responses = [None] * len(message_chunk)

        # --- Process responses for the current chunk ---
        chunk_updates = {}  # {original_df_index: {column: value}}
        successful_in_chunk = 0
        failed_in_chunk = 0
        if responses and len(responses) == len(message_chunk):
            for original_df_index, response in zip(chunk_original_indices, responses):
                estimates = _parse_estimate(response, original_df_index)
                if estimates is None:
                    failed_in_chunk += 1
                else:
                    successful_in_chunk += 1
                    chunk_updates[original_df_index] = estimates
        else:
            print(
                f"Warning: Mismatch between number of responses ({len(responses) if responses else 0}) "
                f"and messages sent ({len(message_chunk)}) for chunk {chunk_no}, or no responses. Marking all as failed."
            )
            failed_in_chunk = len(message_chunk)

        print(
            f"Chunk {chunk_no} processing summary: Success={successful_in_chunk}, Failed/Skipped={failed_in_chunk}"
        )
        processed_count_total += successful_in_chunk

        # --- Update Main DataFrame and Save Periodically ---
        if chunk_updates:
            print(
                f"Updating main DataFrame with {len(chunk_updates)} new estimates for chunk {chunk_no}..."
            )
            for idx, estimates in chunk_updates.items():
                # One cell at a time so each column keeps its own dtype.
                for column, value in estimates.items():
                    df.loc[idx, column] = value

            print(f"Saving progress to {FILENAME}...")
            save_dataframe(df, FILENAME)
        else:
            print(f"No successful estimates obtained in chunk {chunk_no} to save.")

        # --- Rate Limiting Pause ---
        chunk_duration = time.time() - chunk_start_time
        print(f"Chunk {chunk_no} took {chunk_duration:.2f} seconds.")
        if chunk_no < num_chunks:  # No pause after the last chunk
            _pause_for_rate_limit(len(message_chunk), chunk_duration)

    total_duration_minutes = (time.time() - overall_start_time) / 60
    print(
        f"\nBatch completion finished."
        f" Processed {processed_count_total} new estimates in this run in {total_duration_minutes:.2f} minutes."
    )

    print(f"Performing final save to {FILENAME}...")
    save_dataframe(df, FILENAME)

    print("\nScript finished.")
							
								
								
									
										521
									
								
								old/analysis.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										521
									
								
								old/analysis.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,521 @@ | ||||||
|  | import os | ||||||
|  | import litellm | ||||||
|  | import sqlite3 | ||||||
|  | import numpy as np | ||||||
|  | import pandas as pd | ||||||
|  | from google.colab import userdata, files | ||||||
|  | import seaborn as sns | ||||||
|  | import matplotlib.pyplot as plt | ||||||
|  | import matplotlib as mpl | ||||||
|  | 
 | ||||||
# Copy the API keys returned by Colab's `userdata` into the process
# environment under the same names.
for _secret_name in ('OPENAI_API_KEY', 'GEMINI_API_KEY'):
    os.environ[_secret_name] = userdata.get(_secret_name)
|  | 
 | ||||||
# Two-digit occupation major-group code -> human-readable group title.
# NOTE(review): these look like SOC major groups — confirm against the data source.
occupation_major_codes = {
    "11": "Management",
    "13": "Business and Financial Operations",
    "15": "Computer and Mathematical Occupations",
    "17": "Architecture and Engineering",
    "19": "Life, Physical, and Social Science",
    "21": "Community and Social Services",
    "23": "Legal",
    "25": "Education, Training, and Library",
    "27": "Arts, Design, Entertainment, Sports, and Media",
    "29": "Healthcare Practitioners and Technical",
    "31": "Healthcare Support",
    "33": "Protective Service",
    "35": "Food Preparation and Serving Related",
    "37": "Building and Grounds Cleaning and Maintenance",
    "39": "Personal Care and Service",
    "41": "Sales and Related",
    "43": "Office and Administrative Support",
    "45": "Farming, Fishing, and Forestry",
    "47": "Construction and Extraction",
    "49": "Installation, Maintenance, and Repair",
    "51": "Production",
    "53": "Transportation and Material Moving",
    "55": "Military Specific",
}
|  | 
 | ||||||
# Colour ramps used by the plots below, keyed by shade name
# ("50" is the lightest entry, "950" the darkest).
gray = {
    "50": "#f8fafc",
    "100": "#f1f5f9",
    "200": "#e2e8f0",
    "300": "#cbd5e1",
    "400": "#94a3b8",
    "500": "#64748b",
    "600": "#475569",
    "700": "#334155",
    "800": "#1e293b",
    "900": "#0f172a",
    "950": "#020617",
}
lime = {
    "50": "#f7fee7",
    "100": "#ecfcca",
    "200": "#d8f999",
    "300": "#bbf451",
    "400": "#9ae600",
    "500": "#83cd00",
    "600": "#64a400",
    "700": "#497d00",
    "800": "#3c6300",
    "900": "#35530e",
    "950": "#192e03",
}
|  | 
 | ||||||
# Apply seaborn's base theme FIRST. set_style() and set_context() write their
# own values for the face/edge/label/tick colours, the font family and the font
# size into matplotlib's rcParams, so calling them after the custom update
# (as this block previously did) silently replaced the palette chosen below.
sns.set_style("white")         # minimal axes, no default grid
sns.set_context("notebook")

# Project-specific overrides on top of the seaborn theme.
mpl.rcParams.update({
    'figure.facecolor' : gray['50'],
    'axes.facecolor'   : gray['50'],
    'axes.edgecolor'   : gray['100'],
    'axes.labelcolor'  : gray['700'],
    'xtick.color'      : gray['700'],
    'ytick.color'      : gray['700'],
    'font.family'      : 'Inter',  # matplotlib falls back to its default font if Inter is not installed
    'font.size'        : 11,
})
|  | 
 | ||||||
def prepare_tasks():
    """Assemble the task table with remote status, estimateability and major group.

    Reads ``task_ratings_enriched.json`` and ``tasks_estimateable.csv`` from the
    working directory, left-joins both annotations onto the tasks, filters on
    ``importance_average`` and derives ``onetsoc_major`` from the first two
    characters of ``onetsoc_code``.

    NOTE(review): nothing is returned and nothing is stored outside the
    function, so ``df_tasks``, ``df_task_estimateable`` and ``df_remote_tasks``
    are all discarded when it exits. Other functions in this file read
    ``df_tasks`` / ``df_task_estimateable`` as module-level names.
    """

    # Run uv run ./enrich_task_ratings.py
    df_tasks = pd.read_json("task_ratings_enriched.json")

    # Run uv run classify_estimateability_of_tasks.py
    # Keep one classification per distinct task text (first occurrence wins).
    df_task_estimateable = pd.read_csv("tasks_estimateable.csv").rename(columns={"task_estimateable": "estimateable"}).drop_duplicates(subset=['task'], keep='first')

    # df_tasks now has a remote_status column which contains either "remote" or "not remote"
    # NOTE(review): df_remote_status is not defined anywhere in this file; unless
    # it exists as a global (e.g. from an earlier notebook cell) this line
    # raises NameError. Its expected columns are 'Task' and 'Remote'.
    df_tasks = pd.merge(df_tasks, df_remote_status[['Task', 'Remote']], left_on='task', right_on='Task', how='left')
    df_tasks = df_tasks.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'})

    # df_tasks now has an estimateable column which contains either "ATOMIC" or "ONGOING-CONSTRAINT"
    df_tasks = pd.merge(df_tasks, df_task_estimateable[['task', 'estimateable']], on='task', how='left')

    # NOTE(review): this keeps tasks with importance BELOW 3, whereas
    # preprocessing_time_estimates() keeps tasks ABOVE 3 - confirm which
    # direction is intended.
    df_tasks = df_tasks[df_tasks['importance_average'] < 3].copy()

    # Major occupation group = first two characters of the occupation code.
    df_tasks['onetsoc_major'] = df_tasks['onetsoc_code'].str[:2]

    df_remote_tasks = df_tasks[df_tasks['remote_status'] == 'remote'].copy()

    # Call create_task_estimates() from add_task_estimates? which creates tasks_with_estimates.csv
|  | 
 | ||||||
def preprocessing_time_estimates():
    """Load the task time estimates, convert them to minutes and sanity-check them.

    Reads ``tasks_with_estimates.csv`` and ``tasks_with_embeddings.parquet``
    from the working directory, keeps tasks with ``importance_average`` above 3,
    attaches the embedding vector and the estimateability label of each task,
    converts the lower/upper bound estimates to minutes and derives the range,
    ratio and midpoint of each estimate. Diagnostics (missing or impossible
    bounds among the ATOMIC tasks) are printed / displayed, not raised.

    Relies on two names that are not defined in this function:
    ``df_task_estimateable`` (a frame with ``task`` and ``estimateable``
    columns) and ``display`` (provided by IPython/Colab).
    NOTE(review): ``df_task_estimateable`` is only ever built as a local inside
    ``prepare_tasks()``; confirm where the module-level frame comes from.

    Returns:
        tuple: ``(df, atomic_tasks, ongoing_tasks)`` - the full enriched frame
        and independent copies of its ATOMIC and ONGOING-CONSTRAINT rows.

    Raises:
        KeyError: if a bound uses a unit outside the supported set.
    """
    # Minutes per unit. Month, trimester, semester and year are taken as
    # 30, 90, 180 and 365 days respectively.
    minutes_per_unit = {
        "minute": 1,
        "hour": 60,
        "day": 60 * 24,
        "week": 60 * 24 * 7,
        "month": 60 * 24 * 30,
        "trimester": 60 * 24 * 90,
        "semester": 60 * 24 * 180,
        "year": 60 * 24 * 365,
    }

    def convert_to_minutes(qty, unit):
        """Convert a Series of quantities in the given units to minutes.

        A missing quantity or unit yields NaN so that the missing-value check
        below can report it; an unrecognised unit is still an error.
        """
        unknown_units = set(unit.dropna()) - set(minutes_per_unit)
        if unknown_units:
            raise KeyError(f"Unknown time unit(s): {sorted(map(str, unknown_units))}")
        return qty * unit.map(minutes_per_unit)

    df = pd.read_csv("tasks_with_estimates.csv")
    df = df[df['importance_average'] > 3].copy()

    # The embeddings come from running `uv run ./embed_task_description.py`.
    # Only the 'task' and 'task_embedding' columns are used, one row per
    # distinct task text.
    df_task_embeddings = (
        pd.read_parquet("tasks_with_embeddings.parquet")
        .drop_duplicates(subset=['task'])[['task', 'task_embedding']]
        .rename(columns={"task_embedding": "embedding_vector"})
    )

    df = pd.merge(df, df_task_embeddings, on='task', how='left')
    df = pd.merge(df, df_task_estimateable[['task', 'estimateable']], on='task', how='left')

    df['onetsoc_major'] = df['onetsoc_code'].str[:2]

    df['lb_estimate_in_minutes'] = convert_to_minutes(df['lb_estimate_qty'], df['lb_estimate_unit'])
    df['ub_estimate_in_minutes'] = convert_to_minutes(df['ub_estimate_qty'], df['ub_estimate_unit'])

    df['estimate_range'] = df['ub_estimate_in_minutes'] - df['lb_estimate_in_minutes']
    # A zero lower bound produces an infinite ratio; such rows are reported by
    # the impossible-bounds check below.
    df['estimate_ratio'] = df['ub_estimate_in_minutes'] / df['lb_estimate_in_minutes']
    df['estimate_midpoint'] = (df['lb_estimate_in_minutes'] + df['ub_estimate_in_minutes']) / 2

    # Copies, so that columns can be added to these frames later without
    # pandas' chained-assignment warning.
    atomic_tasks = df[df['estimateable'] == 'ATOMIC'].copy()
    ongoing_tasks = df[df['estimateable'] == 'ONGOING-CONSTRAINT'].copy()

    with pd.option_context('display.max_columns', None):
        display(df)

    # Check for empty estimates
    for column in ('lb_estimate_in_minutes', 'ub_estimate_in_minutes'):
        missing_count = atomic_tasks[column].isnull().sum()
        if missing_count > 0:
            print(f"Missing values in '{column}':", missing_count)

    # Check for impossible bounds
    impossible_bounds = atomic_tasks[
        (atomic_tasks['lb_estimate_in_minutes'] <= 0) |
        (atomic_tasks['ub_estimate_in_minutes'] <= 0) |
        (atomic_tasks['lb_estimate_in_minutes'] > atomic_tasks['ub_estimate_in_minutes'])
    ]
    if not impossible_bounds.empty:
        print("Error: Found rows with impossible bounds.")
        with pd.option_context('display.max_colwidth', None):
            display(impossible_bounds[['task', 'lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'dwas']])

    return df, atomic_tasks, ongoing_tasks
|  | 
 | ||||||
def cell1():
    """Draw a log-scaled histogram of the atomic tasks' estimate midpoints.

    Reads the module-level ``atomic_tasks`` frame.
    """
    midpoints = atomic_tasks['estimate_midpoint']
    sns.histplot(midpoints, log_scale=True)
|  | 
 | ||||||
def cell2():
    """Box-plot the estimate range (upper minus lower bound) per major group.

    Reads the module-level ``atomic_tasks`` frame. Outlier markers are hidden
    and the y axis is logarithmic because the ranges are long-tailed.
    """
    plt.figure(figsize=(14, 10))
    sns.boxplot(
        x='onetsoc_major',   # two-digit major group, e.g. 11 = Management
        y='estimate_range',
        data=atomic_tasks,
        showfliers=False,
    )
    plt.yscale('log')
    plt.xlabel('Occupation')
    plt.ylabel('Range (upper-lower, minutes)')
    plt.title('Spread of time-range estimates per occupation')

    # Swap the two-digit codes on the x axis for their readable labels.
    axis = plt.gca()
    readable_labels = []
    for tick_label in axis.get_xticklabels():
        readable_labels.append(occupation_major_codes[tick_label.get_text()])
    axis.set_xticklabels(readable_labels, rotation=60, ha='right')
|  | 
 | ||||||
def cell3():
    """Scatter the lower bound against the upper bound of every atomic task.

    Reads the module-level ``atomic_tasks`` frame. Points are coloured by major
    occupation group; both axes are logarithmic. A dashed diagonal marks
    upper == lower and dotted lines mark upper = 2x, 10x and 100x lower.
    """
    lower_col = 'lb_estimate_in_minutes'
    upper_col = 'ub_estimate_in_minutes'

    plt.figure(figsize=(10, 10))
    # Replace the group codes by their labels so the legend is readable.
    labelled_tasks = atomic_tasks.replace({'onetsoc_major': occupation_major_codes})
    ax = sns.scatterplot(
        data=labelled_tasks,
        x=lower_col,
        y=upper_col,
        hue="onetsoc_major",
        alpha=0.2,
        edgecolor=None,
    )

    # Reference lines span from 1 minute to the largest bound in the data.
    largest_bound = atomic_tasks[[lower_col, upper_col]].max().max()
    lims = (1, largest_bound)
    ax.plot(lims, lims, color='black', linestyle='--', linewidth=1)
    for multiple in (2, 10, 100):
        scaled = [multiple * value for value in lims]
        ax.plot(lims, scaled, linestyle=':', color='grey', linewidth=1)

    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlabel('Lower-bound (min, log scale)')
    ax.set_ylabel('Upper-bound (min, log scale)')
    ax.set_title('Lower vs upper estimates for all tasks')

    # Anchor the legend's upper-left corner at the axes' upper-right corner.
    ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
|  | 
 | ||||||
def cell4():
    """Histogram of log10(upper / lower) over the atomic tasks.

    Reads the module-level ``atomic_tasks`` frame. Infinite ratios are removed
    before taking the logarithm. Vertical guides mark ratios of 10x, 1.05x and
    1x (upper == lower).
    """
    finite_ratios = (
        atomic_tasks['estimate_ratio']
        .replace([np.inf, -np.inf], np.nan)
        .dropna()
    )

    plt.figure(figsize=(8, 4))
    sns.histplot(np.log10(finite_ratios), bins=60, kde=True)

    for factor, colour, caption in ((10, 'red', '10×'), (1.05, 'orange', '1.05×')):
        plt.axvline(np.log10(factor), color=colour, ls='--', lw=1, label=caption)
    plt.axvline(0, color='black', ls='-', lw=1)          # ub = lb

    plt.xlabel('log₁₀(upper / lower)')
    plt.ylabel('Count')
    plt.title('Distribution of upper:lower ratio')
    plt.legend()
    plt.tight_layout()
|  | 
 | ||||||
|  | 
 | ||||||
def cell5():
    """Heatmap of the median upper/lower ratio by occupation and task length.

    Adds an ``lb_q`` column (quartile of the lower bound) to the module-level
    ``atomic_tasks`` frame, then pivots the median ``estimate_ratio`` by major
    occupation group (rows) and quartile (columns).
    """
    quartile_names = ['Q1 shortest', 'Q2', 'Q3', 'Q4 longest']

    # Bin the lower bounds into four equally populated groups.
    atomic_tasks['lb_q'] = pd.qcut(
        atomic_tasks['lb_estimate_in_minutes'], q=4, labels=quartile_names
    )

    # One cell per (occupation, quartile): the median ratio of its tasks.
    pivot = atomic_tasks.pivot_table(
        values='estimate_ratio',
        index='onetsoc_major',
        columns='lb_q',
        aggfunc='median',
    )

    # Show occupation labels instead of two-digit codes on the rows.
    pivot.index = pivot.index.map(occupation_major_codes)

    plt.figure(figsize=(10, 8))
    colorbar_options = {'label': 'Median upper/lower ratio'}
    sns.heatmap(pivot, cmap='RdYlGn_r', center=2, annot=True, fmt='.1f',
                cbar_kws=colorbar_options)
    plt.xlabel('Lower-bound quartile')
    plt.ylabel('Occupation (major group)')
    plt.title('Typical range width by occupation and task length')
    plt.tight_layout()
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
def cell6():
    """Flag atomic tasks whose midpoint is far from their occupation's mean.

    For every ``onetsoc_code`` the mean and sample standard deviation of
    ``estimate_midpoint`` are computed, and each task gets a z-score against
    its own occupation. The module-level ``atomic_tasks`` frame gains (or has
    overwritten) the columns ``mean``, ``std`` and ``z``; the frame object
    itself is not rebound, so the function can be re-run safely.

    Returns:
        pandas.DataFrame: the rows of ``atomic_tasks`` with ``|z| > 3``.
        Occupations with a single task or zero spread give NaN/inf scores;
        NaN scores are never reported as outliers.
    """
    # Per-occupation summary. Only `mean` and `std` feed the z-score below;
    # the remaining statistics are kept from the original analysis.
    agg = (atomic_tasks
           .groupby('onetsoc_code')['estimate_midpoint']
           .agg(median='median',
                q1=lambda x: x.quantile(.25),
                q3=lambda x: x.quantile(.75),
                mean='mean',
                std='std'))
    agg['IQR'] = agg.q3 - agg.q1
    agg['CV'] = agg['std'] / agg['mean']            # coefficient of variation

    # Broadcast the group statistics back onto each row by occupation code.
    # Assigning columns (instead of `atomic_tasks = atomic_tasks.merge(...)`)
    # avoids making `atomic_tasks` a local name, which previously caused an
    # UnboundLocalError on the first read above.
    atomic_tasks['mean'] = atomic_tasks['onetsoc_code'].map(agg['mean'])
    atomic_tasks['std'] = atomic_tasks['onetsoc_code'].map(agg['std'])

    atomic_tasks['z'] = (atomic_tasks['estimate_midpoint'] - atomic_tasks['mean']) / atomic_tasks['std']
    outliers = atomic_tasks.loc[atomic_tasks['z'].abs() > 3]
    return outliers
|  | 
 | ||||||
def cell7():
    """Add a MAD-based robust z-score of the midpoint within each occupation.

    Writes a ``robust_z`` column onto the module-level ``atomic_tasks`` frame:
    (value - group median) / group MAD, per ``onetsoc_code``, with the MAD
    computed using ``scale='normal'``.
    """
    from scipy.stats import median_abs_deviation

    def robust_score(values):
        centre = values.median()
        spread = median_abs_deviation(values, scale='normal')
        return (values - centre) / spread

    midpoints_by_occupation = atomic_tasks.groupby('onetsoc_code')['estimate_midpoint']
    atomic_tasks['robust_z'] = midpoints_by_occupation.transform(robust_score)
|  | 
 | ||||||
def cell10():
    """Stacked bar chart of task categories per major occupation group.

    Reads the module-level ``df_tasks`` frame. For every major group that has
    at least one task, the tasks are split into three shares of the group's
    total: not remote, remote and ATOMIC, remote and ONGOING-CONSTRAINT.
    Remote tasks with any other ``estimateable`` value count towards the total
    only, so the three shares need not sum to 100%.
    """
    import matplotlib.ticker as mtick    # percentage tick formatting
    import matplotlib.colors as mcolors  # colour conversion

    def get_contrasting_text_color(bg_color_hex_or_rgba):
        """Return 'black' or 'white', whichever reads better on the background.

        Accepts a colour string (e.g. '#RRGGBB') or an RGBA tuple with
        components in [0, 1]. Alpha is ignored.
        """
        if isinstance(bg_color_hex_or_rgba, str):
            rgba = mcolors.to_rgba(bg_color_hex_or_rgba)
        else:
            rgba = bg_color_hex_or_rgba
        red, green, blue, _ = rgba
        luminance = 0.2126 * red + 0.7152 * green + 0.0722 * blue
        # Light backgrounds get dark text; 0.55 is the cut-off.
        return 'black' if luminance > 0.55 else 'white'

    # --- Count the tasks of each category per major group ---
    summary_data = []
    for code, label in occupation_major_codes.items():
        group_tasks = df_tasks[df_tasks['onetsoc_major'] == code]
        group_size = len(group_tasks)
        if group_size == 0:
            continue  # no tasks for this occupation

        is_remote = group_tasks['remote_status'] == 'remote'
        remote_kinds = group_tasks.loc[is_remote, 'estimateable']

        summary_data.append({
            'onetsoc_major_code': code,
            'occupation_label': label,
            'count_not_remote': int((~is_remote).sum()),
            'count_remote_atomic': int((remote_kinds == 'ATOMIC').sum()),
            'count_remote_ongoing': int((remote_kinds == 'ONGOING-CONSTRAINT').sum()),
            'total_tasks': group_size,
        })

    summary_df = pd.DataFrame(summary_data)

    # --- Convert counts to percentages of the group's total ---
    summary_df = summary_df[summary_df['total_tasks'] > 0].copy()
    count_to_pct = (
        ('count_not_remote', 'pct_not_remote'),
        ('count_remote_atomic', 'pct_remote_atomic'),
        ('count_remote_ongoing', 'pct_remote_ongoing'),
    )
    for count_col, pct_col in count_to_pct:
        summary_df[pct_col] = (summary_df[count_col] / summary_df['total_tasks']) * 100

    pct_columns = ['pct_not_remote', 'pct_remote_atomic', 'pct_remote_ongoing']
    plot_df = summary_df.set_index('occupation_label')[pct_columns]
    # Legend-friendly names, then order the bars by their "Not Remote" share.
    plot_df.columns = ['Not Remote', 'Remote + Estimable', 'Remote + Not estimable']
    plot_df = plot_df.sort_values(by='Not Remote', ascending=False)

    # --- Plot ---
    # One colour per column of plot_df, in column order.
    bar_colors = [gray["300"], lime["500"], lime["200"]]

    fig, ax = plt.subplots(figsize=(14, 10))
    plot_df.plot.barh(stacked=True, ax=ax, color=bar_colors)

    ax.set_xlabel("Percentage of Tasks (%)", fontsize=12)
    ax.set_ylabel("Occupation Major Group", fontsize=12)
    ax.set_title("Task Breakdown by Occupation, Remote Status, and Estimateability", fontsize=14, pad=20)

    ax.xaxis.set_major_formatter(mtick.PercentFormatter())
    ax.set_xlim(0, 100)

    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)

    # Write each segment's percentage at its centre, skipping segments of
    # 3% or less.
    for position, container in enumerate(ax.containers):
        text_color = get_contrasting_text_color(bar_colors[position])
        for patch in container.patches:
            width = patch.get_width()
            if width <= 3:
                continue
            center_x = patch.get_x() + width / 2
            center_y = patch.get_y() + patch.get_height() / 2
            ax.text(center_x, center_y, f"{width:.1f}%",
                    ha='center', va='center',
                    fontsize=8, color=text_color, fontweight='medium')

    ax.legend(title="Task Category", bbox_to_anchor=(1.02, 1), loc='upper left', frameon=False)
|  | 
 | ||||||
def cell11():
    """Bar chart of the total wage bill per major occupation group, in billions.

    Wage bill = ``TOT_EMP`` * ``A_MEAN``, summed over all rows that share the
    first two characters of ``OCC_CODE``.

    Works on the module-level ``df_oesm`` frame and modifies it in place: adds
    ``onetsoc_major`` and ``wage_bill`` columns, coerces ``TOT_EMP`` and
    ``A_MEAN`` to numeric and drops rows where either is missing.

    NOTE(review): a second function named ``cell11`` is defined later in this
    file and replaces this one when the module is loaded, so this plot cannot
    currently be reached by name.
    NOTE(review): if ``df_oesm`` contains aggregate rows in addition to the
    detailed occupations, summing by code prefix counts them twice - confirm
    against the data source.
    """
    df_oesm['onetsoc_major'] = df_oesm['OCC_CODE'].str[:2]

    # Non-numeric entries become NaN and are dropped below.
    df_oesm['TOT_EMP'] = pd.to_numeric(df_oesm['TOT_EMP'], errors='coerce')
    df_oesm['A_MEAN'] = pd.to_numeric(df_oesm['A_MEAN'], errors='coerce')
    df_oesm.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_major'], inplace=True)

    df_oesm['wage_bill'] = df_oesm['TOT_EMP'] * df_oesm['A_MEAN']

    # Aggregate wage bill by major group and attach the readable group title.
    df_wage_bill_major = df_oesm.groupby('onetsoc_major')['wage_bill'].sum().reset_index()
    df_wage_bill_major['OCC_TITLE_MAJOR'] = df_wage_bill_major['onetsoc_major'].map(occupation_major_codes)

    # The axis is labelled "in billions", so plot the scaled values; the raw
    # totals were previously plotted under that label.
    df_wage_bill_major['wage_bill_billions'] = df_wage_bill_major['wage_bill'] / 1e9

    # Largest wage bill first.
    df_wage_bill_major = df_wage_bill_major.sort_values('wage_bill', ascending=False)

    plt.figure(figsize=(12, 8))
    sns.barplot(x='wage_bill_billions', y='OCC_TITLE_MAJOR', data=df_wage_bill_major, palette="viridis")
    plt.title('Total Wage Bill per Major Occupation Group')
    plt.xlabel('Total Wage Bill (in billions)')
    plt.ylabel('Major Occupation Group')
    plt.grid(axis='x', linestyle='--', alpha=0.7)
|  | 
 | ||||||
def cell11():
    """Plot empirical CDFs of the lower, upper and mid-point time estimates.

    Reads the module-level ``atomic_tasks`` frame. The x axis is logarithmic
    (minutes) with ticks at human-friendly durations; the y axis is the
    percentage of tasks whose estimate is at most x.
    """
    def cdf(series):
        """Return (sorted values, cumulative percentage of rows) for a Series."""
        ordered = series.sort_values().reset_index(drop=True)
        cumulative_pct = ((ordered.index + 1) / len(ordered)) * 100
        return ordered.values, cumulative_pct

    lower = atomic_tasks['lb_estimate_in_minutes']
    upper = atomic_tasks['ub_estimate_in_minutes']

    x_lb, y_lb = cdf(lower)
    x_ub, y_ub = cdf(upper)
    x_mid, y_mid = cdf((upper + lower) / 2)

    fig, ax = plt.subplots(figsize=(10, 6))

    # Faint horizontal guide every 10 %.
    for y_val in range(0, 101, 10):
        ax.axhline(y_val, color=gray['100'], linewidth=.8, zorder=1)

    # The three curves, drawn bottom to top: lower bound (light lime, dashed),
    # upper bound (dark lime, dotted), mid-point (medium lime, solid, thicker).
    curves = (
        (x_lb, y_lb, dict(color=lime['300'], linewidth=1.8, linestyle='--',
                          zorder=2, label='Lower bound estimate (CDF)')),
        (x_ub, y_ub, dict(color=lime['900'], linewidth=1.8, linestyle=':',
                          zorder=3, label='Upper bound estimate (CDF)')),
        (x_mid, y_mid, dict(color=lime['600'], linewidth=2.2,
                            zorder=4, label='Mid-point estimate (CDF)')),
    )
    for xs, ys, style in curves:
        ax.step(xs, ys, where='post', **style)

    ax.set_ylim(0, 100)
    ax.set_xscale('log')
    ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(decimals=0))

    # The y-axis caption is drawn as free text above the top-left corner.
    ax.text(-0.06, 1.03,
            "% of tasks with temporal coherence ≤ X",
            ha='left', va='bottom',
            transform=ax.transAxes,
            fontsize=12, fontweight='semibold')

    # Tick positions in minutes, paired with their captions.
    tick_spec = [
        (1, '1 min'), (5, '5 min'), (10, '10 min'), (30, '30 min'),
        (60, '1 hour'), (120, '2 hours'), (240, '4 hours'), (480, '8 hours'),
        (1440, '1 day'), (2880, '2 days'), (10080, '1 week'), (43200, '30 days'),
        (129600, '90 days'), (259200, '180 days'), (525600, '1 year'),
    ]
    ticks = [minutes for minutes, _ in tick_spec]
    ticklabels = [caption for _, caption in tick_spec]

    # Dashed vertical guide at every tick.
    for tick in ticks:
        ax.axvline(tick, color=gray['300'], linewidth=.8, linestyle='--', zorder=1)

    ax.set_xticks(ticks)
    ax.set_xticklabels(ticklabels, rotation=45, ha='right')

    for hidden_side in ('top', 'right'):
        ax.spines[hidden_side].set_visible(False)
    for visible_side in ('left', 'bottom'):
        ax.spines[visible_side].set_edgecolor(gray['300'])

    ax.legend(frameon=False, loc='lower right')

    # The x-axis caption is drawn as free text below the rotated tick labels.
    ax.text(0.5, -0.3,
            'Temporal coherence (X)',
            ha='center', va='center',
            transform=ax.transAxes,
            fontsize=12, fontweight='semibold')
							
								
								
									
										411
									
								
								old/classify_estimateability_of_tasks.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										411
									
								
								old/classify_estimateability_of_tasks.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,411 @@ | ||||||
|  | import pandas as pd | ||||||
|  | import litellm | ||||||
|  | import dotenv | ||||||
|  | import os | ||||||
|  | import time | ||||||
|  | import json | ||||||
|  | import math | ||||||
|  | 
 | ||||||
# Load environment variables via python-dotenv before anything below runs.
# NOTE(review): override=True presumably lets values from the .env file replace
# variables that are already set in the process environment - confirm.
dotenv.load_dotenv(override=True)
|  | 
 | ||||||
|  | # litellm._turn_on_debug() # Optional debugging | ||||||
|  | 
 | ||||||
# --- Configuration ---
# The model must support json_schema / structured output.
MODEL = "gpt-4.1-mini"
# Requests per minute.
RATE_LIMIT = 5_000
# Number of unique tasks per API call.
CHUNK_SIZE = 300
SECONDS_PER_MINUTE = 60

# File configuration
# Output file holding the classifications.
CLASSIFICATION_FILENAME = "tasks_estimateable.csv"
# Used to create the classification file when it does not exist yet.
TASK_SOURCE_FOR_INIT_FILENAME = "tasks_with_estimates.csv"
# Column of the classification file that receives the category.
OUTPUT_COLUMN_NAME = "task_estimateable"
# Column of the source file, and the value in it, named by the source filter.
SOURCE_FILTER_COLUMN = "remote_status"
SOURCE_FILTER_VALUE = "remote"
|  | 
 | ||||||
# --- Prompts and Schema ---
# System prompt: one header line followed by one line per category.
SYSTEM_PROMPT_CLASSIFY = "\n".join((
    "Classify the provided O*NET task into one of these categories:",
    " -  ATOMIC (schedulable): A single, clearly-bounded activity, typically lasting minutes, hours, or a few days.",
    " -  ONGOING-CONSTRAINT (background role/ethical rule): A continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”).",
))

# User message; `{task}` is filled in with the task text.
USER_MESSAGE_TEMPLATE_CLASSIFY = "Task: {task}"

# The only values the model may return.
CLASSIFICATION_CATEGORIES = ["ATOMIC", "ONGOING-CONSTRAINT"]

# JSON schema for the structured response: an object with exactly one
# required string field, `task_category`, restricted to the categories above.
SCHEMA_FOR_CLASSIFICATION = dict(
    name="classify_task_type",
    strict=True,
    schema=dict(
        type="object",
        properties=dict(
            task_category=dict(
                type="string",
                enum=CLASSIFICATION_CATEGORIES,
                description="The classification of the task (ATOMIC or ONGOING-CONSTRAINT).",
            ),
        ),
        required=["task_category"],
        additionalProperties=False,
    ),
)
|  | 
 | ||||||
|  | 
 | ||||||
def save_dataframe(df_to_save, filename):
    """Write the DataFrame to ``filename`` as CSV without leaving a partial file.

    The data is first written to ``filename + ".tmp"`` (UTF-8 with BOM, no
    index column) and then moved over the target with ``os.replace``. Failures
    are reported on stdout instead of being raised; on failure the temporary
    file is removed if it exists.
    """
    try:
        staging_path = filename + ".tmp"
        df_to_save.to_csv(staging_path, index=False, encoding="utf-8-sig")
        os.replace(staging_path, filename)
    except Exception as save_err:
        print(f"--- Error saving DataFrame to {filename}: {save_err} ---")
        if not os.path.exists(staging_path):
            return
        # Best-effort clean-up of the half-written staging file.
        try:
            os.remove(staging_path)
        except Exception as cleanup_err:
            print(
                f"--- Error removing temporary save file {staging_path}: {cleanup_err} ---"
            )
|  | 
 | ||||||
|  | 
 | ||||||
# --- Load or Initialize DataFrame ---
# Resume from CLASSIFICATION_FILENAME when it exists; otherwise build it from the
# rows of TASK_SOURCE_FOR_INIT_FILENAME whose SOURCE_FILTER_COLUMN equals
# SOURCE_FILTER_VALUE. Fatal problems print a message and end the script with
# exit status 1 so a calling shell or scheduler can tell that the run failed.
try:
    if os.path.exists(CLASSIFICATION_FILENAME):
        df = pd.read_csv(CLASSIFICATION_FILENAME, encoding="utf-8-sig")
        print(f"Successfully read {len(df)} rows from {CLASSIFICATION_FILENAME}.")

        save_needed_after_load = False
        if OUTPUT_COLUMN_NAME not in df.columns:
            df[OUTPUT_COLUMN_NAME] = pd.NA
            print(f"Added '{OUTPUT_COLUMN_NAME}' column.")
            save_needed_after_load = True

        # Normalise blank cells to NA. The result is assigned back to the frame:
        # an in-place replace on `df[col]` operates on a temporary selection and
        # leaves `df` untouched when pandas Copy-on-Write is active.
        df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].replace(["", None], pd.NA)

        # The output column stores category strings, so a numeric dtype (e.g. an
        # all-empty column read back as float64) is widened to object.
        if df[OUTPUT_COLUMN_NAME].dtype != object and not isinstance(
            df[OUTPUT_COLUMN_NAME].dtype, pd.StringDtype
        ):
            try:
                df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].astype(object)
                print(
                    f"Corrected dtype of '{OUTPUT_COLUMN_NAME}' to {df[OUTPUT_COLUMN_NAME].dtype}."
                )
                save_needed_after_load = True
            except Exception as e:
                print(
                    f"Warning: Could not convert column '{OUTPUT_COLUMN_NAME}' to object: {e}."
                )

        if "task" not in df.columns:
            print(
                f"Error: {CLASSIFICATION_FILENAME} must contain a 'task' column for processing."
            )
            raise SystemExit(1)

        if save_needed_after_load:
            print(f"Saving {CLASSIFICATION_FILENAME} after adding/adjusting column.")
            save_dataframe(df, CLASSIFICATION_FILENAME)
    else:
        print(
            f"{CLASSIFICATION_FILENAME} not found. Attempting to create it from {TASK_SOURCE_FOR_INIT_FILENAME}."
        )
        if not os.path.exists(TASK_SOURCE_FOR_INIT_FILENAME):
            print(
                f"Error: Source file {TASK_SOURCE_FOR_INIT_FILENAME} not found. Cannot create {CLASSIFICATION_FILENAME}."
            )
            raise SystemExit(1)

        df_source = pd.read_csv(TASK_SOURCE_FOR_INIT_FILENAME, encoding="utf-8-sig")

        required_source_cols_for_init = ["task", SOURCE_FILTER_COLUMN]
        missing_source_cols = [
            col for col in required_source_cols_for_init if col not in df_source.columns
        ]
        if missing_source_cols:
            print(
                f"Error: Source file {TASK_SOURCE_FOR_INIT_FILENAME} is missing required columns for initialization: {', '.join(missing_source_cols)}."
            )
            raise SystemExit(1)

        df_source_filtered = df_source[
            df_source[SOURCE_FILTER_COLUMN] == SOURCE_FILTER_VALUE
        ].copy()

        if df_source_filtered.empty:
            print(
                f"Warning: No tasks with '{SOURCE_FILTER_COLUMN}' == '{SOURCE_FILTER_VALUE}' found in {TASK_SOURCE_FOR_INIT_FILENAME}. "
                f"{CLASSIFICATION_FILENAME} will be created with schema but no tasks to classify initially."
            )

        # Only the task text is carried over; the output column starts out all-NA.
        df = df_source_filtered[["task"]].copy()
        df[OUTPUT_COLUMN_NAME] = pd.NA
        df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].astype(object)

        print(
            f"Created {CLASSIFICATION_FILENAME} using tasks from {TASK_SOURCE_FOR_INIT_FILENAME} "
            f"(where {SOURCE_FILTER_COLUMN}='{SOURCE_FILTER_VALUE}'). New file has {len(df)} tasks."
        )
        save_dataframe(df, CLASSIFICATION_FILENAME)

# SystemExit derives from BaseException, so the exits raised above pass through
# these handlers untouched.
except FileNotFoundError:
    print("Error: A required file was not found. Please check paths.")
    raise SystemExit(1)
except Exception as e:
    print(f"Error during DataFrame loading or initialization: {e}")
    raise SystemExit(1)
|  | 
 | ||||||
|  | 
 | ||||||
# --- Identify Unique Tasks to Process ---
# Work out which rows still lack a classification and reduce them to the set of
# distinct task texts, so each text is sent to the API only once.
if df.empty:
    print(f"{CLASSIFICATION_FILENAME} is empty. Nothing to process. Exiting.")
    exit()

initial_unprocessed_mask = df[OUTPUT_COLUMN_NAME].isna()

if not initial_unprocessed_mask.any():
    print(
        f"All tasks in {CLASSIFICATION_FILENAME} seem to have been classified already. Exiting."
    )
    exit()

# A row qualifies only if it is unprocessed AND its 'task' is a non-blank string.
_has_task_text = df["task"].notna() & (df["task"].str.strip() != "")
valid_tasks_to_consider_df = df[initial_unprocessed_mask & _has_task_text]

if valid_tasks_to_consider_df.empty:
    print(
        f"No valid, unclassified tasks found to process (after filtering out empty/NaN task descriptions). Exiting."
    )
    exit()

# First-seen order is kept, so label i lines up with message i later on.
unique_task_labels_for_api = valid_tasks_to_consider_df["task"].unique().tolist()

# Counts every NA row, including rows whose task text is blank or missing.
total_rows_to_update_potentially = int(initial_unprocessed_mask.sum())

print(
    f"Found {total_rows_to_update_potentially} total rows in {CLASSIFICATION_FILENAME} needing classification."
)
print(
    f"Identified {len(unique_task_labels_for_api)} unique, valid task labels to send to the API."
)
|  | 
 | ||||||
|  | 
 | ||||||
# --- Prepare messages for batch completion (only for unique task labels) ---
print(f"Preparing messages for {len(unique_task_labels_for_api)} unique task labels...")

# Every label was already filtered to be non-empty and non-NaN, so each one
# yields exactly one [system, user] conversation, in the same order as the labels.
messages_list = [
    [
        {"role": "system", "content": SYSTEM_PROMPT_CLASSIFY},
        {
            "role": "user",
            "content": USER_MESSAGE_TEMPLATE_CLASSIFY.format(task=label),
        },
    ]
    for label in unique_task_labels_for_api
]

print(f"Prepared {len(messages_list)} message sets for batch completion.")

# Defensive guard: an empty label list should already have ended the script.
if len(messages_list) == 0:
    print(
        "No messages prepared, though unique tasks were identified. This is unexpected. Exiting."
    )
    exit()
|  | 
 | ||||||
|  | 
 | ||||||
# --- Call batch_completion in chunks with rate limiting and periodic saving ---
# There is one message set per unique task label, so this is also the number of
# API requests that will be issued.
total_unique_tasks_to_send = len(messages_list)
num_chunks = math.ceil(total_unique_tasks_to_send / CHUNK_SIZE)

print(
    f"\nStarting batch classification for {total_unique_tasks_to_send} unique task labels in {num_chunks} chunks..."
)

# Running total of DataFrame rows actually filled in, and the run's start time.
processed_rows_count_total = 0
overall_start_time = time.time()
 | ||||||
def _extract_task_category(response, task_label):
    """Return the validated category carried by one API response, or None.

    Args:
        response: One element of the list returned by litellm.batch_completion.
        task_label: The task text the response belongs to (used in messages).

    Returns:
        The "task_category" value when it is present and a member of
        CLASSIFICATION_CATEGORIES; otherwise None. Every failure is reported
        with a printed message and never raised to the caller.
    """
    if response is None:
        print(f"API call failed for task label '{task_label}' (response is None).")
        return None

    # NOTE(review): litellm.batch_completion appears to return the raised
    # exception object in place of a response for a call that failed; report it
    # as a failed call rather than as a malformed response. Confirm against the
    # installed litellm version.
    if isinstance(response, Exception):
        print(
            f"API call failed for task label '{task_label}': {type(response).__name__} - {response}"
        )
        return None

    content_str = None
    try:
        if (
            response.choices
            and response.choices[0].message
            and response.choices[0].message.content
        ):
            content_str = response.choices[0].message.content
            category_raw = json.loads(content_str).get("task_category")

            if category_raw is not None and category_raw in CLASSIFICATION_CATEGORIES:
                return category_raw

            print(
                f"Warning: Invalid or missing task_category for task label '{task_label}': '{category_raw}'. Content: '{content_str}'"
            )
            return None

        finish_reason = (
            response.choices[0].finish_reason
            if (response.choices and response.choices[0].finish_reason)
            else "unknown"
        )
        error_message = (
            response.choices[0].message.content
            if (response.choices and response.choices[0].message)
            else "No content in message."
        )
        print(
            f"Warning: Received non-standard or empty response content for task label '{task_label}'. "
            f"Finish Reason: '{finish_reason}'. Message: '{error_message}'. Raw Choices: {response.choices}"
        )
    except json.JSONDecodeError:
        print(
            f"Warning: Could not decode JSON for task label '{task_label}'. Content received: '{content_str}'"
        )
    except AttributeError as ae:
        print(
            f"Warning: Missing attribute processing response for task label '{task_label}': {ae}. Response: {response}"
        )
    except Exception as e:
        print(
            f"Warning: Unexpected error processing response for task label '{task_label}': {type(e).__name__} - {e}. Response: {response}"
        )
    return None


# Seconds each request must account for to stay within RATE_LIMIT requests per
# minute (0 disables pacing). It does not change between chunks.
time_per_request = SECONDS_PER_MINUTE / RATE_LIMIT if RATE_LIMIT > 0 else 0

for i in range(num_chunks):
    chunk_start_message_index = i * CHUNK_SIZE
    chunk_end_message_index = min((i + 1) * CHUNK_SIZE, total_unique_tasks_to_send)

    message_chunk = messages_list[chunk_start_message_index:chunk_end_message_index]
    # Labels are sliced with the same bounds so position j in the chunk refers
    # to the same task in both lists.
    chunk_task_labels = unique_task_labels_for_api[
        chunk_start_message_index:chunk_end_message_index
    ]

    if not message_chunk:  # Should not happen if loop range is correct
        continue

    print(
        f"\nProcessing chunk {i + 1}/{num_chunks} (Unique Task Labels {chunk_start_message_index + 1}-{chunk_end_message_index} of this run)..."
    )
    chunk_start_time = time.time()
    responses = []
    try:
        print(
            f"Sending {len(message_chunk)} requests (for unique tasks) for chunk {i + 1}..."
        )
        responses = litellm.batch_completion(
            model=MODEL,
            messages=message_chunk,
            response_format={
                "type": "json_schema",
                "json_schema": SCHEMA_FOR_CLASSIFICATION,
            },
            num_retries=3,
        )
        print(f"Chunk {i + 1} API call completed.")

    except Exception as e:
        # A failure of the whole batch call is treated as one failed response
        # per message so the chunk is simply left unclassified for a later run.
        print(f"Error during litellm.batch_completion for chunk {i + 1}: {e}")
        responses = [None] * len(message_chunk)

    # --- Process responses for the current chunk ---
    # Maps task label -> validated classification category.
    chunk_task_classifications = {}
    successful_api_calls_in_chunk = 0
    failed_api_calls_in_chunk = 0

    if responses and len(responses) == len(message_chunk):
        for current_task_label, response in zip(chunk_task_labels, responses):
            category = _extract_task_category(response, current_task_label)
            if category is None:
                failed_api_calls_in_chunk += 1
            else:
                successful_api_calls_in_chunk += 1
                chunk_task_classifications[current_task_label] = category
    else:
        print(
            f"Warning: Mismatch between #responses ({len(responses) if responses else 0}) "
            f"and #messages sent ({len(message_chunk)}) for chunk {i + 1}, or no responses. Marking all API calls in chunk as failed."
        )
        failed_api_calls_in_chunk = len(message_chunk)

    # --- Update Main DataFrame and Save Periodically ---
    rows_updated_this_chunk = 0
    if chunk_task_classifications:
        print(
            f"Updating main DataFrame with classifications for {len(chunk_task_classifications)} unique tasks from chunk {i + 1}..."
        )
        # Look every row's task up in this chunk's results in a single pass,
        # then fill only the rows that matched AND are still unclassified.
        mapped_categories = df["task"].map(chunk_task_classifications)
        update_condition = df[OUTPUT_COLUMN_NAME].isna() & mapped_categories.notna()
        rows_updated_this_chunk = int(update_condition.sum())

        if rows_updated_this_chunk > 0:
            df.loc[update_condition, OUTPUT_COLUMN_NAME] = mapped_categories[
                update_condition
            ]

        print(
            f"Updated {rows_updated_this_chunk} rows in the DataFrame based on this chunk's API responses."
        )
        print(f"Saving progress to {CLASSIFICATION_FILENAME}...")
        save_dataframe(df, CLASSIFICATION_FILENAME)
    else:
        print(
            f"No successful API classifications obtained in chunk {i + 1} to update DataFrame or save."
        )

    print(
        f"Chunk {i + 1} API summary: Successful Calls={successful_api_calls_in_chunk}, Failed/Skipped Calls={failed_api_calls_in_chunk}. "
        f"Rows updated in DataFrame this chunk: {rows_updated_this_chunk}"
    )
    processed_rows_count_total += rows_updated_this_chunk

    # --- Rate Limiting Pause ---
    chunk_end_time = time.time()
    chunk_duration = chunk_end_time - chunk_start_time
    print(f"Chunk {i + 1} (API calls and DF update) took {chunk_duration:.2f} seconds.")

    # No pause after the final chunk: nothing else will be sent.
    if i < num_chunks - 1:
        # Minimum time this chunk should have taken given the number of requests.
        min_chunk_duration_for_rate = len(message_chunk) * time_per_request
        pause_needed = max(0, min_chunk_duration_for_rate - chunk_duration)

        if pause_needed > 0:
            print(
                f"Pausing for {pause_needed:.2f} seconds to respect rate limit ({RATE_LIMIT}/min)..."
            )
            time.sleep(pause_needed)
|  | 
 | ||||||
# Report how much was done and how long the whole run took.
overall_end_time = time.time()
total_duration_minutes = (overall_end_time - overall_start_time) / 60
_summary_parts = [
    "\nBatch classification finished.",
    f" Updated {processed_rows_count_total} rows in '{CLASSIFICATION_FILENAME}' with new classifications in this run.",
    f" Total duration: {total_duration_minutes:.2f} minutes.",
]
print("".join(_summary_parts))

# Written once more unconditionally, even though chunks with results were
# already saved as they completed.
print(f"Performing final save to {CLASSIFICATION_FILENAME}...")
save_dataframe(df, CLASSIFICATION_FILENAME)

print("\nScript finished.")
							
								
								
									
										85
									
								
								old/create_onet_database.sh
									
										
									
									
									
										Executable file
									
								
							
							
						
						
									
										85
									
								
								old/create_onet_database.sh
									
										
									
									
									
										Executable file
									
								
							|  | @ -0,0 +1,85 @@ | ||||||
#!/usr/bin/env bash
#
# Builds a local SQLite copy of the O*NET database:
#   1. download the SQL dump zip (skipped when the zip is already present),
#   2. extract it (skipped when the extraction directory already exists),
#   3. recreate $ONET_DB_NAME and import every *.sql file, sorted by path,
#      inside a single transaction,
#   4. switch the finished database to WAL mode, then optimize and VACUUM it.

# Make a pipeline fail when any stage fails, not only its last command, so a
# find/sort/cat error during the import is not hidden by sqlite3's own status.
set -o pipefail

# Set database name and directories
ONET_DB_NAME="onet.database"
ONET_ZIP_URL="https://www.onetcenter.org/dl_files/database/db_29_1_mysql.zip"
ONET_ZIP_FILE="db_29_1_mysql.zip"
ONET_EXTRACT_DIR="db_29_1_mysql"

# Download O*NET database only if not already downloaded
if [ ! -f "$ONET_ZIP_FILE" ]; then
    echo "Downloading O*NET database from $ONET_ZIP_URL"

    # Download under a temporary name so an interrupted or failed transfer is
    # never mistaken for a complete zip on the next run. -f makes curl fail on
    # HTTP errors instead of saving the server's error page as the archive.
    ONET_ZIP_TMP="$ONET_ZIP_FILE.part"
    if curl -fL -o "$ONET_ZIP_TMP" "$ONET_ZIP_URL" || wget -O "$ONET_ZIP_TMP" "$ONET_ZIP_URL"; then
        if ! mv "$ONET_ZIP_TMP" "$ONET_ZIP_FILE"; then
            echo "Failed to download O*NET database"
            exit 1
        fi
    else
        rm -f "$ONET_ZIP_TMP"
        echo "Failed to download O*NET database"
        exit 1
    fi
else
    echo "Using existing O*NET database zip file"
fi

# Extract downloaded zip file only if extraction directory doesn't exist
if [ ! -d "$ONET_EXTRACT_DIR" ]; then
    echo "Extracting O*NET database files"
    if ! unzip -o "$ONET_ZIP_FILE"; then
        echo "Failed to extract O*NET database files"
        exit 1
    fi
else
    echo "Using existing extracted O*NET database files"
fi

# Remove existing database if it exists. A previous run leaves the database in
# WAL mode, so its -wal/-shm sidecar files are removed together with it.
if [ -f "$ONET_DB_NAME" ]; then
    echo "Removing existing database"
    rm -f "$ONET_DB_NAME" "$ONET_DB_NAME-wal" "$ONET_DB_NAME-shm"
fi

# Writes the complete import script to stdout: speed-oriented PRAGMAs, then all
# SQL files (sorted by path) wrapped in one transaction.
#
# The PRAGMAs must travel in the same sqlite3 session as the import: these
# settings apply to the connection that issues them, so setting them in a
# separate sqlite3 invocation would not affect the import at all.
# foreign_keys is deliberately not switched on here; it was never in effect
# during the import before, and enabling it could reject files that are loaded
# before the tables they reference.
emit_import_sql() {
    cat << 'EOF' || return 1
PRAGMA journal_mode = OFF;
PRAGMA synchronous = 0;
PRAGMA cache_size = 1000000;
PRAGMA locking_mode = EXCLUSIVE;
PRAGMA temp_store = MEMORY;
BEGIN TRANSACTION;
EOF
    # NUL-delimited so paths containing spaces survive; streamed rather than
    # captured in a shell variable so the dump never has to fit in memory.
    find "$ONET_EXTRACT_DIR" -name "*.sql" -print0 | sort -z | xargs -0 cat || return 1
    echo "COMMIT;"
}

# Create a new SQLite database with optimized settings for fast import and
# execute all SQL files in one transaction
echo "Creating new SQLite database: $ONET_DB_NAME with performance settings"
echo "Executing SQL files in alphabetical order (single transaction mode)"

# Check if the execution was successful
if ! emit_import_sql | sqlite3 "$ONET_DB_NAME"; then
    echo "Error executing SQL files in batch transaction"
    exit 1
fi

echo "Database populated successfully. Restoring reliability settings..."

# Restore reliability-focused settings after import. Of these, journal_mode = WAL
# is the one recorded in the database file; the remaining PRAGMAs only affect
# this session (in which optimize and VACUUM run).
if ! sqlite3 "$ONET_DB_NAME" << 'EOF'
PRAGMA journal_mode = WAL;
PRAGMA synchronous = NORMAL;
PRAGMA locking_mode = NORMAL;
PRAGMA temp_store = DEFAULT;
PRAGMA foreign_keys = ON;
PRAGMA optimize;
VACUUM;
EOF
then
    echo "Warning: Failed to restore reliability settings, but database is populated"
else
    echo "Reliability settings restored successfully"
fi

echo "O*NET database created and optimized successfully!"
							
								
								
									
										392
									
								
								old/enrich_task_ratings.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										392
									
								
								old/enrich_task_ratings.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,392 @@ | ||||||
|  | import sqlite3 | ||||||
|  | import pandas as pd | ||||||
|  | import json | ||||||
|  | import os | ||||||
|  | from collections import defaultdict | ||||||
|  | import numpy as np | ||||||
|  | 
 | ||||||
# --- Configuration ---
# NOTE(review): presumably the SQLite file produced by create_onet_database.sh,
# which uses the same name -- confirm.
DB_FILE = "onet.database"
# NOTE(review): presumably the JSON file the enriched ratings are written to;
# the code that uses it is not shown here -- confirm.
OUTPUT_FILE = "task_ratings_enriched.json"
|  | 
 | ||||||
|  | # --- Database Interaction --- | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
def fetch_data_from_db(db_path):
    """
    Fetches task ratings and their DWA titles from the O*NET SQLite database.

    A single query joins task_ratings with task_statements and occupation_data,
    and LEFT JOINs tasks_to_dwas / dwa_reference so that tasks without any DWA
    are kept. The joined rows are then split into two de-duplicated frames.

    Args:
        db_path (str): Path to the SQLite database file.

    Returns:
        tuple(pandas.DataFrame, pandas.DataFrame): A tuple containing:
            - DataFrame with one row per distinct task rating
              (onetsoc_code, task_id, task, occupation_title,
              occupation_description, scale_id, category, data_value).
            - DataFrame with the distinct (onetsoc_code, task_id, dwa_title)
              combinations; rows without a DWA title are dropped.
        Both frames are empty (with the expected columns) when the query
        returns no rows. Returns (None, None) if the database file doesn't
        exist or an error occurs; errors are printed, not raised.
    """
    ratings_cols = [
        "onetsoc_code",
        "task_id",
        "task",
        "occupation_title",
        "occupation_description",
        "scale_id",
        "category",
        "data_value",
    ]
    dwa_cols = ["onetsoc_code", "task_id", "dwa_title"]

    if not os.path.exists(db_path):
        print(f"Error: Database file not found at {db_path}")
        return None, None

    # LEFT JOINs are used for the DWA tables in case a task has no DWAs.
    query = """
        SELECT
            tr.onetsoc_code,
            tr.task_id,
            ts.task,
            od.title AS occupation_title,
            od.description AS occupation_description,
            tr.scale_id,
            tr.category,
            tr.data_value,
            dr.dwa_title  -- Added DWA title
        FROM
            task_ratings tr
        JOIN
            task_statements ts ON tr.task_id = ts.task_id
        JOIN
            occupation_data od ON tr.onetsoc_code = od.onetsoc_code
        LEFT JOIN
            tasks_to_dwas td ON tr.onetsoc_code = td.onetsoc_code AND tr.task_id = td.task_id --
        LEFT JOIN
            dwa_reference dr ON td.dwa_id = dr.dwa_id; --
        """

    conn = None
    try:
        conn = sqlite3.connect(db_path)
        df = pd.read_sql_query(query, conn)
        print(
            f"Successfully fetched {len(df)} records (including DWA info) from the database."
        )

        if df.empty:
            print("Warning: Fetched DataFrame is empty.")
            # Return empty DataFrames with expected columns if the main fetch is empty
            return pd.DataFrame(columns=ratings_cols), pd.DataFrame(columns=dwa_cols)

        # Check that all core columns exist before attempting to drop duplicates
        missing_core_cols = [col for col in ratings_cols if col not in df.columns]
        if missing_core_cols:
            print(f"Error: Missing core columns in fetched data: {missing_core_cols}")
            return None, None

        # Joining against DWAs repeats each rating once per DWA of its task;
        # keep only the unique combinations of the core task/rating info.
        ratings_df = df[ratings_cols].drop_duplicates().reset_index(drop=True)

        # Get unique DWA info separately
        if all(col in df.columns for col in dwa_cols):
            dwas_df = (
                df[dwa_cols]
                .dropna(subset=["dwa_title"])
                .drop_duplicates()
                .reset_index(drop=True)
            )
        else:
            print("Warning: DWA related columns missing, creating empty DWA DataFrame.")
            dwas_df = pd.DataFrame(columns=dwa_cols)

        return ratings_df, dwas_df

    except sqlite3.Error as e:
        print(f"SQLite error: {e}")
        return None, None
    except Exception as e:
        print(f"An error occurred during data fetching: {e}")
        return None, None
    finally:
        # Runs on every exit path, so the connection is closed exactly once
        # whether the fetch succeeded, returned early, or raised.
        if conn is not None:
            conn.close()
|  | 
 | ||||||
|  | 
 | ||||||
|  | # --- Data Processing --- | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def process_task_ratings_with_dwas(ratings_df, dwas_df): | ||||||
|  |     """ | ||||||
|  |     Processes the fetched data to group, pivot frequency, calculate averages, | ||||||
|  |     structure the output, and add associated DWAs. | ||||||
|  | 
 | ||||||
|  |     Args: | ||||||
|  |         ratings_df (pandas.DataFrame): The input DataFrame with task ratings info. | ||||||
|  |         dwas_df (pandas.DataFrame): The input DataFrame with task-to-DWA mapping. Can be None or empty. | ||||||
|  | 
 | ||||||
|  |     Returns: | ||||||
|  |         list: A list of dictionaries, each representing an enriched task rating with DWAs. | ||||||
|  |               Returns None if the input ratings DataFrame is invalid. | ||||||
|  |     """ | ||||||
|  |     if ratings_df is None or not isinstance( | ||||||
|  |         ratings_df, pd.DataFrame | ||||||
|  |     ):  # Check if it's a DataFrame | ||||||
|  |         print("Error: Input ratings DataFrame is invalid.") | ||||||
|  |         return None | ||||||
|  |     if ratings_df.empty: | ||||||
|  |         print( | ||||||
|  |             "Warning: Input ratings DataFrame is empty. Processing will yield empty result." | ||||||
|  |         ) | ||||||
|  |         # Decide how to handle empty input, maybe return empty list directly | ||||||
|  |         # return [] | ||||||
|  | 
 | ||||||
|  |     # Ensure dwas_df is a DataFrame, even if empty | ||||||
|  |     if dwas_df is None or not isinstance(dwas_df, pd.DataFrame): | ||||||
|  |         print("Warning: Invalid or missing DWA DataFrame. Proceeding without DWA data.") | ||||||
|  |         dwas_df = pd.DataFrame( | ||||||
|  |             columns=["onetsoc_code", "task_id", "dwa_title"] | ||||||
|  |         )  # Ensure it's an empty DF | ||||||
|  | 
 | ||||||
|  |     print("Starting data processing...") | ||||||
|  | 
 | ||||||
|  |     # --- 1. Handle Frequency (FT) --- | ||||||
|  |     freq_df = ratings_df[ratings_df["scale_id"] == "FT"].copy() | ||||||
|  |     if not freq_df.empty: | ||||||
|  |         freq_pivot = freq_df.pivot_table( | ||||||
|  |             index=["onetsoc_code", "task_id"], | ||||||
|  |             columns="category", | ||||||
|  |             values="data_value", | ||||||
|  |             fill_value=0, | ||||||
|  |         ) | ||||||
|  |         freq_pivot.columns = [ | ||||||
|  |             f"frequency_category_{int(col)}" for col in freq_pivot.columns | ||||||
|  |         ] | ||||||
|  |         print(f"Processed Frequency data. Shape: {freq_pivot.shape}") | ||||||
|  |     else: | ||||||
|  |         print("No Frequency (FT) data found.") | ||||||
|  |         # Create an empty DataFrame with the multi-index to allow merging later | ||||||
|  |         idx = pd.MultiIndex( | ||||||
|  |             levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"] | ||||||
|  |         ) | ||||||
|  |         freq_pivot = pd.DataFrame(index=idx) | ||||||
|  | 
 | ||||||
|  |     # --- 2. Handle Importance (IM, IJ) --- | ||||||
|  |     imp_df = ratings_df[ratings_df["scale_id"].isin(["IM", "IJ"])].copy() | ||||||
|  |     if not imp_df.empty: | ||||||
|  |         imp_avg = ( | ||||||
|  |             imp_df.groupby(["onetsoc_code", "task_id"])["data_value"] | ||||||
|  |             .mean() | ||||||
|  |             .reset_index() | ||||||
|  |         ) | ||||||
|  |         imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True) | ||||||
|  |         print(f"Processed Importance data. Shape: {imp_avg.shape}") | ||||||
|  |     else: | ||||||
|  |         print("No Importance (IM, IJ) data found.") | ||||||
|  |         imp_avg = pd.DataFrame( | ||||||
|  |             columns=["onetsoc_code", "task_id", "importance_average"] | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |     # --- 3. Handle Relevance (RT) --- | ||||||
|  |     rel_df = ratings_df[ratings_df["scale_id"] == "RT"].copy() | ||||||
|  |     if not rel_df.empty: | ||||||
|  |         rel_avg = ( | ||||||
|  |             rel_df.groupby(["onetsoc_code", "task_id"])["data_value"] | ||||||
|  |             .mean() | ||||||
|  |             .reset_index() | ||||||
|  |         ) | ||||||
|  |         rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True) | ||||||
|  |         print(f"Processed Relevance data. Shape: {rel_avg.shape}") | ||||||
|  |     else: | ||||||
|  |         print("No Relevance (RT) data found.") | ||||||
|  |         rel_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "relevance_average"]) | ||||||
|  | 
 | ||||||
|  |     # --- 4. Process DWAs --- | ||||||
|  |     if dwas_df is not None and not dwas_df.empty and "dwa_title" in dwas_df.columns: | ||||||
|  |         print("Processing DWA data...") | ||||||
|  |         # Group DWAs by task_id and aggregate titles into a list | ||||||
|  |         dwas_grouped = ( | ||||||
|  |             dwas_df.groupby(["onetsoc_code", "task_id"])["dwa_title"] | ||||||
|  |             .apply(list) | ||||||
|  |             .reset_index() | ||||||
|  |         )  # | ||||||
|  |         dwas_grouped.rename( | ||||||
|  |             columns={"dwa_title": "dwas"}, inplace=True | ||||||
|  |         )  # Rename column to 'dwas' | ||||||
|  |         print(f"Processed DWA data. Shape: {dwas_grouped.shape}") | ||||||
|  |     else: | ||||||
|  |         print("No valid DWA data found or provided for processing.") | ||||||
|  |         dwas_grouped = None  # Set to None if no DWAs | ||||||
|  | 
 | ||||||
|  |     # --- 5. Get Base Task/Occupation Info --- | ||||||
|  |     base_cols = [ | ||||||
|  |         "onetsoc_code", | ||||||
|  |         "task_id", | ||||||
|  |         "task", | ||||||
|  |         "occupation_title", | ||||||
|  |         "occupation_description", | ||||||
|  |     ] | ||||||
|  |     # Check if base columns exist in ratings_df | ||||||
|  |     missing_base_cols = [col for col in base_cols if col not in ratings_df.columns] | ||||||
|  |     if missing_base_cols: | ||||||
|  |         print( | ||||||
|  |             f"Error: Missing base info columns in ratings_df: {missing_base_cols}. Cannot proceed." | ||||||
|  |         ) | ||||||
|  |         return None | ||||||
|  |     if not ratings_df.empty: | ||||||
|  |         base_info = ( | ||||||
|  |             ratings_df[base_cols] | ||||||
|  |             .drop_duplicates() | ||||||
|  |             .set_index(["onetsoc_code", "task_id"]) | ||||||
|  |         ) | ||||||
|  |         print(f"Extracted base info. Shape: {base_info.shape}") | ||||||
|  |     else: | ||||||
|  |         print("Cannot extract base info from empty ratings DataFrame.") | ||||||
|  |         # Create an empty df with index to avoid errors later if possible | ||||||
|  |         idx = pd.MultiIndex( | ||||||
|  |             levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"] | ||||||
|  |         ) | ||||||
|  |         base_info = pd.DataFrame( | ||||||
|  |             index=idx, | ||||||
|  |             columns=[ | ||||||
|  |                 col for col in base_cols if col not in ["onetsoc_code", "task_id"] | ||||||
|  |             ], | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |     # --- 6. Merge Processed Data --- | ||||||
|  |     print("Merging processed data...") | ||||||
|  |     # Start with base_info, which should have the index ['onetsoc_code', 'task_id'] | ||||||
|  |     final_df = base_info.merge( | ||||||
|  |         freq_pivot, left_index=True, right_index=True, how="left" | ||||||
|  |     ) | ||||||
|  |     # Reset index before merging non-indexed dfs | ||||||
|  |     final_df = final_df.reset_index() | ||||||
|  | 
 | ||||||
|  |     # Merge averages - check if they are not empty before merging | ||||||
|  |     if not imp_avg.empty: | ||||||
|  |         final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left") | ||||||
|  |     else: | ||||||
|  |         final_df["importance_average"] = np.nan  # Add column if imp_avg was empty | ||||||
|  | 
 | ||||||
|  |     if not rel_avg.empty: | ||||||
|  |         final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left") | ||||||
|  |     else: | ||||||
|  |         final_df["relevance_average"] = np.nan  # Add column if rel_avg was empty | ||||||
|  | 
 | ||||||
|  |     # Merge DWAs if available | ||||||
|  |     if dwas_grouped is not None and not dwas_grouped.empty: | ||||||
|  |         final_df = final_df.merge( | ||||||
|  |             dwas_grouped, on=["onetsoc_code", "task_id"], how="left" | ||||||
|  |         )  # Merge the dwas list | ||||||
|  |         # Fill NaN in 'dwas' column (for tasks with no DWAs) with empty lists | ||||||
|  |         # Check if 'dwas' column exists before applying function | ||||||
|  |         if "dwas" in final_df.columns: | ||||||
|  |             final_df["dwas"] = final_df["dwas"].apply( | ||||||
|  |                 lambda x: x if isinstance(x, list) else [] | ||||||
|  |             )  # Ensure tasks without DWAs get [] | ||||||
|  |         else: | ||||||
|  |             print("Warning: 'dwas' column not created during merge.") | ||||||
|  |             final_df["dwas"] = [ | ||||||
|  |                 [] for _ in range(len(final_df)) | ||||||
|  |             ]  # Add empty list column | ||||||
|  | 
 | ||||||
|  |     else: | ||||||
|  |         # Add an empty 'dwas' column if no DWA data was processed or merged | ||||||
|  |         final_df["dwas"] = [[] for _ in range(len(final_df))] | ||||||
|  | 
 | ||||||
|  |     print(f"Final merged data shape: {final_df.shape}") | ||||||
|  | 
 | ||||||
|  |     # Convert DataFrame to list of dictionaries for JSON output | ||||||
|  |     # Handle potential NaN values during JSON conversion | ||||||
|  |     # Replace numpy NaN with Python None for JSON compatibility | ||||||
|  |     final_df = final_df.replace({np.nan: None}) | ||||||
|  |     result_list = final_df.to_dict(orient="records") | ||||||
|  | 
 | ||||||
|  |     return result_list | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # --- Output --- | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
def write_to_json(data, output_path):
    """
    Writes the processed data to a JSON file.

    The write is best-effort: every failure is reported on stdout and the
    function returns normally instead of raising.

    Args:
        data (list): The list of dictionaries to write. ``None`` or any
            non-list value is reported and nothing is written.
        output_path (str): Path to the output JSON file. Missing parent
            directories are created.
    """
    if data is None:
        print("No data to write to JSON.")
        return
    if not isinstance(data, list):
        print(
            f"Error: Data to write is not a list (type: {type(data)}). Cannot write to JSON."
        )
        return

    # Serialize before touching the filesystem. json.dump() streams chunks
    # into an already-truncated file, so a value that cannot be serialized
    # would leave a partial, invalid JSON file behind.
    try:
        payload = json.dumps(data, indent=4, ensure_ascii=False)
    except TypeError as e:
        print(f"Error during JSON serialization: {e}. Check data types.")
        return
    except Exception as e:
        print(f"An unexpected error occurred during JSON writing: {e}")
        return

    # Create directory if it doesn't exist. exist_ok=True avoids the
    # check-then-create race of testing os.path.exists() first.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        newly_created = not os.path.isdir(output_dir)
        try:
            os.makedirs(output_dir, exist_ok=True)
        except OSError as e:
            print(f"Error creating output directory {output_dir}: {e}")
            return  # Exit if cannot create directory
        if newly_created:
            print(f"Created output directory: {output_dir}")

    try:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(payload)
    except OSError as e:
        print(f"Error writing JSON file to {output_path}: {e}")
        return
    except Exception as e:
        print(f"An unexpected error occurred during JSON writing: {e}")
        return

    print(f"Successfully wrote enriched data to {output_path}")
|  | 
 | ||||||
|  | 
 | ||||||
|  | # --- Main Execution --- | ||||||
|  | 
 | ||||||
if __name__ == "__main__":
    # Script entry point: fetch -> process -> write, reporting on stdout
    # which stage stopped the run when one of them yields nothing usable.
    print("Starting O*NET Task Ratings & DWAs Enrichment Script...")

    # Step 1: pull both the ratings and the DWA datasets from the database.
    ratings_data_df, dwas_data_df = fetch_data_from_db(DB_FILE)

    if not isinstance(ratings_data_df, pd.DataFrame):
        # Without a ratings DataFrame there is nothing to process.
        print(
            "Data fetching failed or returned invalid type for ratings data. Script terminated."
        )
    else:
        # Step 2: an empty ratings frame is still acceptable here, and a
        # missing/empty DWA frame is dealt with inside the processing call.
        enriched_data = process_task_ratings_with_dwas(ratings_data_df, dwas_data_df)

        # Step 3: only None signals failure; an empty list is a valid result.
        if enriched_data is None:
            print("Data processing failed or returned None. No output file generated.")
        else:
            write_to_json(enriched_data, OUTPUT_FILE)

    print("Script finished.")
|  | @ -7,6 +7,7 @@ from  .run  import Run | ||||||
| import pandas as pd | import pandas as pd | ||||||
| 
 | 
 | ||||||
def enrich_with_task_estimateability(run: Run) -> pd.DataFrame:
    """
    Placeholder for the task-estimateability enrichment step.

    Args:
        run: The current pipeline run.

    Raises:
        NotImplementedError: Always; the enrichment has not been written yet.
    """
    # TODO: presumably this step should record its details on the run's
    # metadata once implemented -- confirm the intended attribute on Run
    # before adding that code.
    raise NotImplementedError
| 
 | 
 | ||||||
| def enrich_with_task_estimates(run: Run) -> pd.DataFrame: | def enrich_with_task_estimates(run: Run) -> pd.DataFrame: | ||||||
|  |  | ||||||
|  | @ -5,13 +5,148 @@ Fetchers retrieve remote data and return it in a format suitable for further pro | ||||||
| import sqlite3 | import sqlite3 | ||||||
| from typing import Tuple | from typing import Tuple | ||||||
| import pandas as pd | import pandas as pd | ||||||
| from .metadata import Metadata | import requests | ||||||
|  | import hashlib | ||||||
|  | import io | ||||||
|  | import zipfile | ||||||
|  | from .run import Run | ||||||
|  | from .logger import logger | ||||||
| 
 | 
 | ||||||
def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
    """
    Downloads the O*NET database, creates a local SQLite file from it, and returns a connection.
    The version is the sha256 of the downloaded zip file.

    The archive is downloaded on every call because the version is derived
    from its bytes; only the SQLite build step is cached, as
    ``onet_<version>.db`` under ``run.cache_dir``.

    Args:
        run: The current pipeline run; ``run.cache_dir`` holds the database file.

    Returns:
        A ``(connection, version)`` tuple. The connection has foreign keys
        enabled and the version is the sha256 hex digest of the zip archive.

    Raises:
        requests.HTTPError: If the download responds with an error status.
        RuntimeError: If the archive contains no ``.sql`` files.
        sqlite3.Error: If the SQL import fails.
    """
    url = "https://www.onetcenter.org/dl_files/database/db_29_1_mysql.zip"
    logger.info(f"Downloading O*NET database from {url}")
    # The whole body is read via .content below, so streaming is pointless;
    # the timeout keeps a stalled server from hanging the pipeline forever.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    # Read content into memory
    zip_content = response.content
    version = hashlib.sha256(zip_content).hexdigest()
    logger.info(f"O*NET database version (sha256): {version}")

    # sqlite3.connect() cannot create missing parent directories.
    run.cache_dir.mkdir(parents=True, exist_ok=True)
    db_path = run.cache_dir / f"onet_{version}.db"

    if db_path.exists():
        logger.info(f"Using cached O*NET database: {db_path}")
        conn = sqlite3.connect(db_path)
        # Set PRAGMA for foreign keys on every connection
        conn.execute("PRAGMA foreign_keys = ON;")
        return conn, version

    # Build under a temporary name and rename only on success. Otherwise a
    # failed or interrupted import would leave a half-populated file at
    # db_path that later runs would accept as a valid cache entry.
    build_path = run.cache_dir / f"onet_{version}.db.partial"
    build_path.unlink(missing_ok=True)

    logger.info(f"Creating new O*NET database: {db_path}")
    conn = sqlite3.connect(build_path)
    try:
        # Set performance PRAGMAs for fast import
        logger.info("Creating new SQLite database with performance settings")
        conn.executescript("""
            PRAGMA journal_mode = OFF;
            PRAGMA synchronous = 0;
            PRAGMA cache_size = 1000000;
            PRAGMA locking_mode = EXCLUSIVE;
            PRAGMA temp_store = MEMORY;
            PRAGMA foreign_keys = ON;
        """)

        with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
            sql_scripts = [
                z.read(filename).decode("utf-8")
                for filename in sorted(z.namelist())
                if filename.endswith(".sql")
            ]

        if not sql_scripts:
            raise RuntimeError("No SQL files found in the O*NET zip archive.")

        # Combine and execute all SQL files in one transaction
        full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;"

        logger.info("Executing SQL files in alphabetical order (single transaction mode)")
        conn.executescript(full_script)
        logger.info("Database populated successfully. Restoring reliability settings...")

        # Restore reliability-focused settings after import
        conn.executescript("""
            PRAGMA journal_mode = WAL;
            PRAGMA synchronous = NORMAL;
            PRAGMA locking_mode = NORMAL;
            PRAGMA temp_store = DEFAULT;
            PRAGMA foreign_keys = ON;
            PRAGMA optimize;
        """)
        conn.execute("VACUUM;")
        conn.commit()
        logger.info("Reliability settings restored and database optimized successfully!")
    except BaseException:
        # Clean up on any failure, including Ctrl-C, then re-raise unchanged.
        conn.close()
        build_path.unlink(missing_ok=True)
        raise

    # Close before renaming so no handle points at the temporary name, then
    # reopen exactly as the cached path above does.
    conn.close()
    build_path.replace(db_path)

    conn = sqlite3.connect(db_path)
    conn.execute("PRAGMA foreign_keys = ON;")
    return conn, version
|  | 
 | ||||||
def fetch_oesm_data(run: Run) -> Tuple[pd.DataFrame, str]:
    """
    Downloads the OESM national data from the BLS website.
    The version is the sha256 of the downloaded zip file.

    The parsed spreadsheet is cached as ``oesm_<version>.parquet`` under
    ``run.cache_dir``; the archive itself is downloaded on every call because
    the version is derived from its bytes.

    Args:
        run: The current pipeline run; ``run.cache_dir`` holds the parquet cache.

    Returns:
        A ``(dataframe, version)`` tuple, where version is the sha256 hex
        digest of the zip archive.

    Raises:
        requests.HTTPError: If the download responds with an error status.
        FileNotFoundError: If the archive contains no ``.xlsx`` file.
    """
    url = "https://www.bls.gov/oes/special-requests/oesm23nat.zip"
    logger.info(f"Downloading OESM data from {url}")
    # The timeout keeps a stalled server from hanging the pipeline forever.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    zip_content = response.content
    version = hashlib.sha256(zip_content).hexdigest()
    logger.info(f"OESM data version (sha256): {version}")

    # The cache directory may not exist yet on a first run.
    run.cache_dir.mkdir(parents=True, exist_ok=True)
    parquet_path = run.cache_dir / f"oesm_{version}.parquet"
    if parquet_path.exists():
        logger.info(f"Using cached OESM data: {parquet_path}")
        return pd.read_parquet(parquet_path), version

    logger.info(f"Creating new OESM data cache: {parquet_path}")
    with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
        # Find the excel file in the zip (the first .xlsx entry wins)
        excel_filename = None
        for filename in z.namelist():
            logger.debug(f"Found file in OESM zip: {filename}")
            if filename.lower().endswith(".xlsx"):
                excel_filename = filename
                break

        if excel_filename is None:
            raise FileNotFoundError("Could not find the Excel file in the OESM zip archive.")

        logger.info(f"Reading {excel_filename} from zip archive.")
        with z.open(excel_filename) as f:
            df = pd.read_excel(f, engine='openpyxl')

    # Write under a temporary name and rename on success, so an interrupted
    # write can never be picked up later as a valid cache entry.
    partial_path = run.cache_dir / f"oesm_{version}.parquet.partial"
    try:
        df.to_parquet(partial_path)
        partial_path.replace(parquet_path)
    except BaseException:
        partial_path.unlink(missing_ok=True)
        raise
    logger.info(f"Saved OESM data to cache: {parquet_path}")
    return df, version
|  | 
 | ||||||
def fetch_epoch_remote_data(run: Run) -> Tuple[pd.DataFrame, str]:
    """
    Downloads the EPOCH AI remote work task data.
    The version is the sha256 of the downloaded CSV file.

    The parsed CSV is cached as ``epoch_remote_<version>.parquet`` under
    ``run.cache_dir``; the file itself is downloaded on every call because
    the version is derived from its bytes.

    Args:
        run: The current pipeline run; ``run.cache_dir`` holds the parquet cache.

    Returns:
        A ``(dataframe, version)`` tuple, where version is the sha256 hex
        digest of the downloaded bytes.

    Raises:
        requests.HTTPError: If the download responds with an error status.
        RuntimeError: If the server answers with an HTML page instead of the file.
    """
    # This is the direct download link constructed from the Google Drive share link
    url = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
    logger.info(f"Downloading EPOCH remote data from Google Drive: {url}")

    # A session keeps cookies across any redirects Google Drive issues. The
    # timeout keeps a stalled server from hanging the pipeline forever.
    with requests.Session() as session:
        response = session.get(url, timeout=60)
        response.raise_for_status()

        # A share link can answer 200 with an HTML page (for example a
        # confirmation or sign-in page) rather than the file. Hashing and
        # caching that page would poison the cache, so fail loudly instead.
        content_type = response.headers.get("Content-Type", "")
        if "text/html" in content_type.lower():
            raise RuntimeError(
                f"Expected CSV data from {url} but received an HTML page "
                f"(Content-Type: {content_type})."
            )

        csv_content = response.content

    version = hashlib.sha256(csv_content).hexdigest()
    logger.info(f"EPOCH remote data version (sha256): {version}")

    # The cache directory may not exist yet on a first run.
    run.cache_dir.mkdir(parents=True, exist_ok=True)
    parquet_path = run.cache_dir / f"epoch_remote_{version}.parquet"
    if parquet_path.exists():
        logger.info(f"Using cached EPOCH remote data: {parquet_path}")
        return pd.read_parquet(parquet_path), version

    logger.info(f"Creating new EPOCH remote data cache: {parquet_path}")
    df = pd.read_csv(io.BytesIO(csv_content))

    # Write under a temporary name and rename on success, so an interrupted
    # write can never be picked up later as a valid cache entry.
    partial_path = run.cache_dir / f"epoch_remote_{version}.parquet.partial"
    try:
        df.to_parquet(partial_path)
        partial_path.replace(parquet_path)
    except BaseException:
        partial_path.unlink(missing_ok=True)
        raise
    logger.info(f"Saved EPOCH remote data to cache: {parquet_path}")

    return df, version
|  |  | ||||||
|  | @ -2,5 +2,5 @@ from ..run import Run | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| from typing import Generator | from typing import Generator | ||||||
| 
 | 
 | ||||||
def generate_estimate_histplot(run: Run) -> Generator[Path]:
    """Placeholder for the estimate histogram generator; not implemented yet.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()
|  |  | ||||||
							
								
								
									
										24
									
								
								pipeline/logger.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								pipeline/logger.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,24 @@ | ||||||
|  | import logging | ||||||
|  | from logging.handlers import RotatingFileHandler | ||||||
|  | from rich.logging import RichHandler | ||||||
|  | 
 | ||||||
LOGGER_NAME = "pipeline"


def setup_logging() -> logging.Logger:
    """
    Configure and return the shared pipeline logger.

    The logger named ``LOGGER_NAME`` is set to DEBUG and given a Rich console
    handler. Calling this more than once is safe: the handler is attached
    only if the logger does not already have a ``RichHandler``.

    Returns:
        The configured ``logging.Logger`` instance.
    """
    logger = logging.getLogger(LOGGER_NAME)
    logger.setLevel(logging.DEBUG)

    # logging.getLogger() returns the same object for a given name, so an
    # unconditional addHandler() on a repeat call would stack a second
    # handler and emit every record twice.
    already_attached = any(
        isinstance(handler, RichHandler) for handler in logger.handlers
    )
    if not already_attached:
        # Set up Rich console handler
        rich_handler = RichHandler(
            level=logging.DEBUG,
            show_time=True,
            enable_link_path=True,
            rich_tracebacks=True,
        )
        logger.addHandler(rich_handler)

    return logger


# Module-level logger shared by the rest of the package.
logger = setup_logging()
|  | @ -16,6 +16,7 @@ class Metadata(BaseModel): | ||||||
|     versions, and other important information. |     versions, and other important information. | ||||||
|     """ |     """ | ||||||
|     fetchers: Dict[str, Dict[str, Any]] = Field(default_factory=dict) |     fetchers: Dict[str, Dict[str, Any]] = Field(default_factory=dict) | ||||||
|  |     enrichments: Dict[str, Dict[str, Any]] = Field(default_factory=dict) | ||||||
| 
 | 
 | ||||||
|     ts: str = Field(default_factory=lambda: datetime.now().strftime("%Y-%m-%d %H:%M:%S")) |     ts: str = Field(default_factory=lambda: datetime.now().strftime("%Y-%m-%d %H:%M:%S")) | ||||||
|     commit: str = Field(default_factory=lambda: _get_current_commit()) |     commit: str = Field(default_factory=lambda: _get_current_commit()) | ||||||
|  |  | ||||||
|  | @ -1,6 +1,7 @@ | ||||||
| from pydantic import BaseModel, Field | from pydantic import BaseModel, Field | ||||||
| import sqlite3 | import sqlite3 | ||||||
| import pandas as pd | import pandas as pd | ||||||
|  | from pathlib import Path | ||||||
| from typing import Optional | from typing import Optional | ||||||
| from .metadata import Metadata | from .metadata import Metadata | ||||||
| 
 | 
 | ||||||
|  | @ -20,3 +21,6 @@ class Run(BaseModel): | ||||||
|     task_estimates_df: Optional[pd.DataFrame] = None |     task_estimates_df: Optional[pd.DataFrame] = None | ||||||
| 
 | 
 | ||||||
|     meta: Metadata = Field(default_factory=Metadata) |     meta: Metadata = Field(default_factory=Metadata) | ||||||
|  | 
 | ||||||
|  |     cache_dir: Path | ||||||
|  |     output_dir: Path | ||||||
|  |  | ||||||
|  | @ -5,11 +5,14 @@ from .postprocessors import check_for_insanity, create_df_tasks | ||||||
| from .generators import GENERATORS | from .generators import GENERATORS | ||||||
| from .run import Run | from .run import Run | ||||||
| from .constants import GRAY | from .constants import GRAY | ||||||
|  | import platformdirs | ||||||
| import seaborn as sns | import seaborn as sns | ||||||
| import matplotlib as mpl | import matplotlib as mpl | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| from typings import Optional | from typings import Optional | ||||||
| 
 | 
 | ||||||
|  | CACHE_DIR = platformdirs.user_cache_dir("econtai") | ||||||
|  | 
 | ||||||
| def run(output_dir: Optional[str] = None): | def run(output_dir: Optional[str] = None): | ||||||
|     if output_dir is None: |     if output_dir is None: | ||||||
|         output_dir = Path(".") |         output_dir = Path(".") | ||||||
|  | @ -17,12 +20,12 @@ def run(output_dir: Optional[str] = None): | ||||||
|     load_dotenv() |     load_dotenv() | ||||||
|     _setup_graph_rendering() |     _setup_graph_rendering() | ||||||
| 
 | 
 | ||||||
|     current_run = Run() |     current_run = Run(output_dir=output_dir, cache_dir=CACHE_DIR) | ||||||
| 
 | 
 | ||||||
|     # Fetchers (fetchers.py) |     # Fetchers (fetchers.py) | ||||||
|     current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run.meta) |     current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run) | ||||||
|     current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run.meta) |     current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run) | ||||||
|     current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run.meta) |     current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run) | ||||||
| 
 | 
 | ||||||
|     # Enrichments (enrichments.py) |     # Enrichments (enrichments.py) | ||||||
|     current_run.task_estimateability_df = enrich_with_task_estimateability(current_run) |     current_run.task_estimateability_df = enrich_with_task_estimateability(current_run) | ||||||
|  | @ -34,7 +37,7 @@ def run(output_dir: Optional[str] = None): | ||||||
| 
 | 
 | ||||||
|     # Generators (generators/) |     # Generators (generators/) | ||||||
|     for gen in GENERATORS: |     for gen in GENERATORS: | ||||||
|         gen(current_run, output_dir) |         gen(current_run) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def _setup_graph_rendering(): | def _setup_graph_rendering(): | ||||||
|  |  | ||||||
|  | @ -6,9 +6,12 @@ readme = "README.md" | ||||||
| requires-python = ">=3.13" | requires-python = ">=3.13" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|     "matplotlib>=3.10.3", |     "matplotlib>=3.10.3", | ||||||
|  |     "openpyxl>=3.1.5", | ||||||
|     "pandas>=2.2.3", |     "pandas>=2.2.3", | ||||||
|  |     "platformdirs>=4.3.8", | ||||||
|     "pydantic>=2.11.7", |     "pydantic>=2.11.7", | ||||||
|     "python-dotenv>=1.1.1", |     "python-dotenv>=1.1.1", | ||||||
|  |     "requests>=2.32.4", | ||||||
|     "seaborn>=0.13.2", |     "seaborn>=0.13.2", | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
							
								
								
									
										100
									
								
								uv.lock
									
										
									
										generated
									
									
									
								
							
							
						
						
									
										100
									
								
								uv.lock
									
										
									
										generated
									
									
									
								
							|  | @ -11,6 +11,37 @@ wheels = [ | ||||||
|     { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload_time = "2024-05-20T21:33:24.1Z" }, |     { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload_time = "2024-05-20T21:33:24.1Z" }, | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
|  | [[package]] | ||||||
|  | name = "certifi" | ||||||
|  | version = "2025.6.15" | ||||||
|  | source = { registry = "https://pypi.org/simple" } | ||||||
|  | sdist = { url = "https://files.pythonhosted.org/packages/73/f7/f14b46d4bcd21092d7d3ccef689615220d8a08fb25e564b65d20738e672e/certifi-2025.6.15.tar.gz", hash = "sha256:d747aa5a8b9bbbb1bb8c22bb13e22bd1f18e9796defa16bab421f7f7a317323b", size = 158753, upload_time = "2025-06-15T02:45:51.329Z" } | ||||||
|  | wheels = [ | ||||||
|  |     { url = "https://files.pythonhosted.org/packages/84/ae/320161bd181fc06471eed047ecce67b693fd7515b16d495d8932db763426/certifi-2025.6.15-py3-none-any.whl", hash = "sha256:2e0c7ce7cb5d8f8634ca55d2ba7e6ec2689a2fd6537d8dec1296a477a4910057", size = 157650, upload_time = "2025-06-15T02:45:49.977Z" }, | ||||||
|  | ] | ||||||
|  | 
 | ||||||
|  | [[package]] | ||||||
|  | name = "charset-normalizer" | ||||||
|  | version = "3.4.2" | ||||||
|  | source = { registry = "https://pypi.org/simple" } | ||||||
|  | sdist = { url = "https://files.pythonhosted.org/packages/e4/33/89c2ced2b67d1c2a61c19c6751aa8902d46ce3dacb23600a283619f5a12d/charset_normalizer-3.4.2.tar.gz", hash = "sha256:5baececa9ecba31eff645232d59845c07aa030f0c81ee70184a90d35099a0e63", size = 126367, upload_time = "2025-05-02T08:34:42.01Z" } | ||||||
|  | wheels = [ | ||||||
|  |     { url = "https://files.pythonhosted.org/packages/ea/12/a93df3366ed32db1d907d7593a94f1fe6293903e3e92967bebd6950ed12c/charset_normalizer-3.4.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:926ca93accd5d36ccdabd803392ddc3e03e6d4cd1cf17deff3b989ab8e9dbcf0", size = 199622, upload_time = "2025-05-02T08:32:56.363Z" }, | ||||||
|  |     { url = "https://files.pythonhosted.org/packages/04/93/bf204e6f344c39d9937d3c13c8cd5bbfc266472e51fc8c07cb7f64fcd2de/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eba9904b0f38a143592d9fc0e19e2df0fa2e41c3c3745554761c5f6447eedabf", size = 143435, upload_time = "2025-05-02T08:32:58.551Z" }, | ||||||
|  |     { url = "https://files.pythonhosted.org/packages/22/2a/ea8a2095b0bafa6c5b5a55ffdc2f924455233ee7b91c69b7edfcc9e02284/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3fddb7e2c84ac87ac3a947cb4e66d143ca5863ef48e4a5ecb83bd48619e4634e", size = 153653, upload_time = "2025-05-02T08:33:00.342Z" }, | ||||||
|  |     { url = "https://files.pythonhosted.org/packages/b6/57/1b090ff183d13cef485dfbe272e2fe57622a76694061353c59da52c9a659/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98f862da73774290f251b9df8d11161b6cf25b599a66baf087c1ffe340e9bfd1", size = 146231, upload_time = "2025-05-02T08:33:02.081Z" }, | ||||||
|  |     { url = "https://files.pythonhosted.org/packages/e2/28/ffc026b26f441fc67bd21ab7f03b313ab3fe46714a14b516f931abe1a2d8/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c9379d65defcab82d07b2a9dfbfc2e95bc8fe0ebb1b176a3190230a3ef0e07c", size = 148243, upload_time = "2025-05-02T08:33:04.063Z" }, | ||||||
|  |     { url = "https://files.pythonhosted.org/packages/c0/0f/9abe9bd191629c33e69e47c6ef45ef99773320e9ad8e9cb08b8ab4a8d4cb/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e635b87f01ebc977342e2697d05b56632f5f879a4f15955dfe8cef2448b51691", size = 150442, upload_time = "2025-05-02T08:33:06.418Z" }, | ||||||
|  |     { url = "https://files.pythonhosted.org/packages/67/7c/a123bbcedca91d5916c056407f89a7f5e8fdfce12ba825d7d6b9954a1a3c/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1c95a1e2902a8b722868587c0e1184ad5c55631de5afc0eb96bc4b0d738092c0", size = 145147, upload_time = "2025-05-02T08:33:08.183Z" }, | ||||||
|  |     { url = "https://files.pythonhosted.org/packages/ec/fe/1ac556fa4899d967b83e9893788e86b6af4d83e4726511eaaad035e36595/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ef8de666d6179b009dce7bcb2ad4c4a779f113f12caf8dc77f0162c29d20490b", size = 153057, upload_time = "2025-05-02T08:33:09.986Z" }, | ||||||
|  |     { url = "https://files.pythonhosted.org/packages/2b/ff/acfc0b0a70b19e3e54febdd5301a98b72fa07635e56f24f60502e954c461/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:32fc0341d72e0f73f80acb0a2c94216bd704f4f0bce10aedea38f30502b271ff", size = 156454, upload_time = "2025-05-02T08:33:11.814Z" }, | ||||||
|  |     { url = "https://files.pythonhosted.org/packages/92/08/95b458ce9c740d0645feb0e96cea1f5ec946ea9c580a94adfe0b617f3573/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:289200a18fa698949d2b39c671c2cc7a24d44096784e76614899a7ccf2574b7b", size = 154174, upload_time = "2025-05-02T08:33:13.707Z" }, | ||||||
|  |     { url = "https://files.pythonhosted.org/packages/78/be/8392efc43487ac051eee6c36d5fbd63032d78f7728cb37aebcc98191f1ff/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4a476b06fbcf359ad25d34a057b7219281286ae2477cc5ff5e3f70a246971148", size = 149166, upload_time = "2025-05-02T08:33:15.458Z" }, | ||||||
|  |     { url = "https://files.pythonhosted.org/packages/44/96/392abd49b094d30b91d9fbda6a69519e95802250b777841cf3bda8fe136c/charset_normalizer-3.4.2-cp313-cp313-win32.whl", hash = "sha256:aaeeb6a479c7667fbe1099af9617c83aaca22182d6cf8c53966491a0f1b7ffb7", size = 98064, upload_time = "2025-05-02T08:33:17.06Z" }, | ||||||
|  |     { url = "https://files.pythonhosted.org/packages/e9/b0/0200da600134e001d91851ddc797809e2fe0ea72de90e09bec5a2fbdaccb/charset_normalizer-3.4.2-cp313-cp313-win_amd64.whl", hash = "sha256:aa6af9e7d59f9c12b33ae4e9450619cf2488e2bbe9b44030905877f0b2324980", size = 105641, upload_time = "2025-05-02T08:33:18.753Z" }, | ||||||
|  |     { url = "https://files.pythonhosted.org/packages/20/94/c5790835a017658cbfabd07f3bfb549140c3ac458cfc196323996b10095a/charset_normalizer-3.4.2-py3-none-any.whl", hash = "sha256:7f56930ab0abd1c45cd15be65cc741c28b1c9a34876ce8c17a2fa107810c0af0", size = 52626, upload_time = "2025-05-02T08:34:40.053Z" }, | ||||||
|  | ] | ||||||
|  | 
 | ||||||
| [[package]] | [[package]] | ||||||
| name = "contourpy" | name = "contourpy" | ||||||
| version = "1.3.2" | version = "1.3.2" | ||||||
|  | @ -51,6 +82,15 @@ wheels = [ | ||||||
|     { url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload_time = "2023-10-07T05:32:16.783Z" }, |     { url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload_time = "2023-10-07T05:32:16.783Z" }, | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
|  | [[package]] | ||||||
|  | name = "et-xmlfile" | ||||||
|  | version = "2.0.0" | ||||||
|  | source = { registry = "https://pypi.org/simple" } | ||||||
|  | sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload_time = "2024-10-25T17:25:40.039Z" } | ||||||
|  | wheels = [ | ||||||
|  |     { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload_time = "2024-10-25T17:25:39.051Z" }, | ||||||
|  | ] | ||||||
|  | 
 | ||||||
| [[package]] | [[package]] | ||||||
| name = "fonttools" | name = "fonttools" | ||||||
| version = "4.58.5" | version = "4.58.5" | ||||||
|  | @ -68,6 +108,15 @@ wheels = [ | ||||||
|     { url = "https://files.pythonhosted.org/packages/d7/d4/1d85a1996b6188cd2713230e002d79a6f3a289bb17cef600cba385848b72/fonttools-4.58.5-py3-none-any.whl", hash = "sha256:e48a487ed24d9b611c5c4b25db1e50e69e9854ca2670e39a3486ffcd98863ec4", size = 1115318, upload_time = "2025-07-03T14:04:45.378Z" }, |     { url = "https://files.pythonhosted.org/packages/d7/d4/1d85a1996b6188cd2713230e002d79a6f3a289bb17cef600cba385848b72/fonttools-4.58.5-py3-none-any.whl", hash = "sha256:e48a487ed24d9b611c5c4b25db1e50e69e9854ca2670e39a3486ffcd98863ec4", size = 1115318, upload_time = "2025-07-03T14:04:45.378Z" }, | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
|  | [[package]] | ||||||
|  | name = "idna" | ||||||
|  | version = "3.10" | ||||||
|  | source = { registry = "https://pypi.org/simple" } | ||||||
|  | sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload_time = "2024-09-15T18:07:39.745Z" } | ||||||
|  | wheels = [ | ||||||
|  |     { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload_time = "2024-09-15T18:07:37.964Z" }, | ||||||
|  | ] | ||||||
|  | 
 | ||||||
| [[package]] | [[package]] | ||||||
| name = "kiwisolver" | name = "kiwisolver" | ||||||
| version = "1.4.8" | version = "1.4.8" | ||||||
|  | @ -163,6 +212,18 @@ wheels = [ | ||||||
|     { url = "https://files.pythonhosted.org/packages/63/be/b85e4aa4bf42c6502851b971f1c326d583fcc68227385f92089cf50a7b45/numpy-2.2.5-cp313-cp313t-win_amd64.whl", hash = "sha256:d403c84991b5ad291d3809bace5e85f4bbf44a04bdc9a88ed2bb1807b3360bb8", size = 12750096, upload_time = "2025-04-19T22:47:00.147Z" }, |     { url = "https://files.pythonhosted.org/packages/63/be/b85e4aa4bf42c6502851b971f1c326d583fcc68227385f92089cf50a7b45/numpy-2.2.5-cp313-cp313t-win_amd64.whl", hash = "sha256:d403c84991b5ad291d3809bace5e85f4bbf44a04bdc9a88ed2bb1807b3360bb8", size = 12750096, upload_time = "2025-04-19T22:47:00.147Z" }, | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
|  | [[package]] | ||||||
|  | name = "openpyxl" | ||||||
|  | version = "3.1.5" | ||||||
|  | source = { registry = "https://pypi.org/simple" } | ||||||
|  | dependencies = [ | ||||||
|  |     { name = "et-xmlfile" }, | ||||||
|  | ] | ||||||
|  | sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload_time = "2024-06-28T14:03:44.161Z" } | ||||||
|  | wheels = [ | ||||||
|  |     { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload_time = "2024-06-28T14:03:41.161Z" }, | ||||||
|  | ] | ||||||
|  | 
 | ||||||
| [[package]] | [[package]] | ||||||
| name = "packaging" | name = "packaging" | ||||||
| version = "25.0" | version = "25.0" | ||||||
|  | @ -254,6 +315,15 @@ wheels = [ | ||||||
|     { url = "https://files.pythonhosted.org/packages/89/c7/5572fa4a3f45740eaab6ae86fcdf7195b55beac1371ac8c619d880cfe948/pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa", size = 2512835, upload_time = "2025-07-01T09:15:50.399Z" }, |     { url = "https://files.pythonhosted.org/packages/89/c7/5572fa4a3f45740eaab6ae86fcdf7195b55beac1371ac8c619d880cfe948/pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa", size = 2512835, upload_time = "2025-07-01T09:15:50.399Z" }, | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
|  | [[package]] | ||||||
|  | name = "platformdirs" | ||||||
|  | version = "4.3.8" | ||||||
|  | source = { registry = "https://pypi.org/simple" } | ||||||
|  | sdist = { url = "https://files.pythonhosted.org/packages/fe/8b/3c73abc9c759ecd3f1f7ceff6685840859e8070c4d947c93fae71f6a0bf2/platformdirs-4.3.8.tar.gz", hash = "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc", size = 21362, upload_time = "2025-05-07T22:47:42.121Z" } | ||||||
|  | wheels = [ | ||||||
|  |     { url = "https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl", hash = "sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4", size = 18567, upload_time = "2025-05-07T22:47:40.376Z" }, | ||||||
|  | ] | ||||||
|  | 
 | ||||||
| [[package]] | [[package]] | ||||||
| name = "pydantic" | name = "pydantic" | ||||||
| version = "2.11.7" | version = "2.11.7" | ||||||
|  | @ -336,6 +406,21 @@ wheels = [ | ||||||
|     { url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload_time = "2025-03-25T02:24:58.468Z" }, |     { url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload_time = "2025-03-25T02:24:58.468Z" }, | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
|  | [[package]] | ||||||
|  | name = "requests" | ||||||
|  | version = "2.32.4" | ||||||
|  | source = { registry = "https://pypi.org/simple" } | ||||||
|  | dependencies = [ | ||||||
|  |     { name = "certifi" }, | ||||||
|  |     { name = "charset-normalizer" }, | ||||||
|  |     { name = "idna" }, | ||||||
|  |     { name = "urllib3" }, | ||||||
|  | ] | ||||||
|  | sdist = { url = "https://files.pythonhosted.org/packages/e1/0a/929373653770d8a0d7ea76c37de6e41f11eb07559b103b1c02cafb3f7cf8/requests-2.32.4.tar.gz", hash = "sha256:27d0316682c8a29834d3264820024b62a36942083d52caf2f14c0591336d3422", size = 135258, upload_time = "2025-06-09T16:43:07.34Z" } | ||||||
|  | wheels = [ | ||||||
|  |     { url = "https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c", size = 64847, upload_time = "2025-06-09T16:43:05.728Z" }, | ||||||
|  | ] | ||||||
|  | 
 | ||||||
| [[package]] | [[package]] | ||||||
| name = "seaborn" | name = "seaborn" | ||||||
| version = "0.13.2" | version = "0.13.2" | ||||||
|  | @ -365,18 +450,24 @@ version = "0.1.0" | ||||||
| source = { virtual = "." } | source = { virtual = "." } | ||||||
| dependencies = [ | dependencies = [ | ||||||
|     { name = "matplotlib" }, |     { name = "matplotlib" }, | ||||||
|  |     { name = "openpyxl" }, | ||||||
|     { name = "pandas" }, |     { name = "pandas" }, | ||||||
|  |     { name = "platformdirs" }, | ||||||
|     { name = "pydantic" }, |     { name = "pydantic" }, | ||||||
|     { name = "python-dotenv" }, |     { name = "python-dotenv" }, | ||||||
|  |     { name = "requests" }, | ||||||
|     { name = "seaborn" }, |     { name = "seaborn" }, | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
| [package.metadata] | [package.metadata] | ||||||
| requires-dist = [ | requires-dist = [ | ||||||
|     { name = "matplotlib", specifier = ">=3.10.3" }, |     { name = "matplotlib", specifier = ">=3.10.3" }, | ||||||
|  |     { name = "openpyxl", specifier = ">=3.1.5" }, | ||||||
|     { name = "pandas", specifier = ">=2.2.3" }, |     { name = "pandas", specifier = ">=2.2.3" }, | ||||||
|  |     { name = "platformdirs", specifier = ">=4.3.8" }, | ||||||
|     { name = "pydantic", specifier = ">=2.11.7" }, |     { name = "pydantic", specifier = ">=2.11.7" }, | ||||||
|     { name = "python-dotenv", specifier = ">=1.1.1" }, |     { name = "python-dotenv", specifier = ">=1.1.1" }, | ||||||
|  |     { name = "requests", specifier = ">=2.32.4" }, | ||||||
|     { name = "seaborn", specifier = ">=0.13.2" }, |     { name = "seaborn", specifier = ">=0.13.2" }, | ||||||
| ] | ] | ||||||
| 
 | 
 | ||||||
|  | @ -412,3 +503,12 @@ sdist = { url = "https://files.pythonhosted.org/packages/95/32/1a225d6164441be76 | ||||||
| wheels = [ | wheels = [ | ||||||
|     { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload_time = "2025-03-23T13:54:41.845Z" }, |     { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload_time = "2025-03-23T13:54:41.845Z" }, | ||||||
| ] | ] | ||||||
|  | 
 | ||||||
|  | [[package]] | ||||||
|  | name = "urllib3" | ||||||
|  | version = "2.5.0" | ||||||
|  | source = { registry = "https://pypi.org/simple" } | ||||||
|  | sdist = { url = "https://files.pythonhosted.org/packages/15/22/9ee70a2574a4f4599c47dd506532914ce044817c7752a79b6a51286319bc/urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760", size = 393185, upload_time = "2025-06-18T14:07:41.644Z" } | ||||||
|  | wheels = [ | ||||||
|  |     { url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload_time = "2025-06-18T14:07:40.39Z" }, | ||||||
|  | ] | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Félix Dorn
						Félix Dorn