diff --git a/old/add_task_estimates.py b/old/add_task_estimates.py
new file mode 100644
index 0000000..e72a532
--- /dev/null
+++ b/old/add_task_estimates.py
@@ -0,0 +1,507 @@
+import pandas as pd
+import litellm
+import dotenv
+import os
+import time
+import json
+import math
+import numpy as np
+
+# --- Configuration ---
+MODEL = "gpt-4.1-mini"  # Make sure this model supports json_schema or structured output
+RATE_LIMIT = 5000  # Requests per minute
+CHUNK_SIZE = 300
+SECONDS_PER_MINUTE = 60
+FILENAME = (
+    "tasks_with_estimates.csv"  # This CSV should contain the tasks to be processed
+)
+
+# --- Prompts and Schema ---
+SYSTEM_PROMPT = """
+You are an expert assistant evaluating the time to completion required for job tasks. Your goal is to estimate the time range needed for a skilled human to complete the following job task remotely, without supervision.
+
+Provide a lower and an upper bound estimate for the time to completion. These bounds should capture the time within which approximately 80% of instances of performing this specific task are typically completed by a qualified individual.
+
+Base your estimate on the provided task description, its associated activities, and the occupational context. Your estimate must be in one of the allowed units: minute, hour, day, week, month, trimester, semester, year.
+""".strip()
+
+USER_MESSAGE_TEMPLATE = """
+Please estimate the time range for the following remote task:
+
+**Task Description:** {task}
+**Relevant activities for the task:**
+{dwas}
+
+**Occupation Category:** {occupation_title}
+**Occupation Description:** {occupation_description}
+
+Consider the complexity and the typical steps involved.
+""".strip()
+
+ALLOWED_UNITS = [
+    "minute",
+    "hour",
+    "day",
+    "week",
+    "month",
+    "trimester",
+    "semester",
+    "year",
+]
+
+SCHEMA_FOR_VALIDATION = {
+    "name": "estimate_time",
+    "strict": True,  # Enforce schema adherence
+    "schema": {
+        "type": "object",
+        "properties": {
+            "lower_bound_estimate": {
+                "type": "object",
+                "properties": {
+                    "quantity": {
+                        "type": "number",
+                        "description": "The numerical value for the lower bound of the estimate.",
+                    },
+                    "unit": {
+                        "type": "string",
+                        "enum": ALLOWED_UNITS,
+                        "description": "The unit of time for the lower bound.",
+                    },
+                },
+                "required": ["quantity", "unit"],
+                "additionalProperties": False,
+            },
+            "upper_bound_estimate": {
+                "type": "object",
+                "properties": {
+                    "quantity": {
+                        "type": "number",
+                        "description": "The numerical value for the upper bound of the estimate.",
+                    },
+                    "unit": {
+                        "type": "string",
+                        "enum": ALLOWED_UNITS,
+                        "description": "The unit of time for the upper bound.",
+                    },
+                },
+                "required": ["quantity", "unit"],
+                "additionalProperties": False,
+            },
+        },
+        "required": ["lower_bound_estimate", "upper_bound_estimate"],
+        "additionalProperties": False,
+    },
+}
+
+
+def save_dataframe(df_to_save, filename):
+    """Saves the DataFrame to the specified CSV file using atomic write."""
+    try:
+        temp_filename = filename + ".tmp"
+        df_to_save.to_csv(temp_filename, encoding="utf-8-sig", index=False)
+        os.replace(temp_filename, filename)
+    except Exception as e:
+        print(f"--- Error saving DataFrame to {filename}: {e} ---")
+        if os.path.exists(temp_filename):
+            try:
+                os.remove(temp_filename)
+            except Exception as remove_err:
+                print(
+                    f"--- Error removing temporary save file {temp_filename}: {remove_err} ---"
+                )
+
+def create_task_estimates():
+    try:
+        # Read the CSV
+        if os.path.exists(FILENAME):
+            df = pd.read_csv(FILENAME, encoding="utf-8-sig")
+            print(f"Successfully read 
{len(df)} rows from {FILENAME}.") + + estimate_columns_spec = { + "lb_estimate_qty": float, + "lb_estimate_unit": object, + "ub_estimate_qty": float, + "ub_estimate_unit": object, + } + save_needed = False + + for col_name, target_dtype in estimate_columns_spec.items(): + if col_name not in df.columns: + # Initialize with a type-compatible missing value + if target_dtype == float: + df[col_name] = np.nan + else: # object + df[col_name] = pd.NA + df[col_name] = df[col_name].astype(target_dtype) # Enforce dtype + print(f"Added '{col_name}' column as {df[col_name].dtype}.") + save_needed = True + else: + # Column exists, ensure correct dtype + current_pd_dtype = df[col_name].dtype + expected_pd_dtype = pd.Series(dtype=target_dtype).dtype + + if current_pd_dtype != expected_pd_dtype: + try: + if target_dtype == float: + df[col_name] = pd.to_numeric(df[col_name], errors="coerce") + else: # object + df[col_name] = df[col_name].astype(object) + print( + f"Corrected dtype of '{col_name}' to {df[col_name].dtype}." + ) + save_needed = True + except Exception as e: + print( + f"Warning: Could not convert column '{col_name}' to {target_dtype}: {e}. Current dtype: {current_pd_dtype}" + ) + + # Standardize missing values (e.g., empty strings to NA/NaN) + # Replace common missing placeholders with pd.NA first + df[col_name].replace(["", None, ""], pd.NA, inplace=True) + if target_dtype == float: + # For float columns, ensure they are numeric and use np.nan after replacement + df[col_name] = pd.to_numeric(df[col_name], errors="coerce") + + if save_needed: + print(f"Saving {FILENAME} after adding/adjusting estimate columns.") + save_dataframe(df, FILENAME) + else: + print( + f"Error: {FILENAME} not found. Please ensure the file exists and contains task data." + ) + exit() + except FileNotFoundError: + print( + f"Error: {FILENAME} not found. Please ensure the file exists and contains task data." + ) + exit() + except Exception as e: + print(f"Error reading or initializing {FILENAME}: {e}") + exit() + + # --- Identify Rows to Process --- + # We'll check for NaN in one of the primary quantity columns. + unprocessed_mask = df["lb_estimate_qty"].isna() + if unprocessed_mask.any(): + start_index = unprocessed_mask.idxmax() # Finds the index of the first True value + print(f"Resuming processing. First unprocessed row found at index {start_index}.") + df_to_process = df.loc[unprocessed_mask].copy() + original_indices = df_to_process.index # Keep track of original indices + else: + print( + "All rows seem to have estimates already (based on 'lb_estimate_qty'). Exiting." + ) + exit() + + + # --- Prepare messages for batch completion (only for rows needing processing) --- + messages_list = [] + skipped_rows_indices = [] + valid_original_indices = [] + + if not df_to_process.empty: + required_cols = ["task", "occupation_title", "occupation_description", "dwas"] + print( + f"Preparing messages for up to {len(df_to_process)} rows starting from original index {original_indices[0] if len(original_indices) > 0 else 'N/A'}..." + ) + print(f"Checking for required columns: {required_cols}") + + for index, row in df_to_process.iterrows(): + missing_or_empty = [] + for col in required_cols: + if col not in row or pd.isna(row[col]) or str(row[col]).strip() == "": + missing_or_empty.append(col) + + if missing_or_empty: + print( + f"Warning: Skipping row original index {index} due to missing/empty required data in columns: {', '.join(missing_or_empty)}." 
+ ) + skipped_rows_indices.append(index) + continue + + try: + user_message = USER_MESSAGE_TEMPLATE.format( + task=row["task"], + occupation_title=row["occupation_title"], + occupation_description=row["occupation_description"], + dwas=row["dwas"], + ) + except KeyError as e: + print( + f"Error: Skipping row original index {index} due to formatting error - missing key: {e}. Check USER_MESSAGE_TEMPLATE and CSV columns." + ) + skipped_rows_indices.append(index) + continue + + messages_for_row = [ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": user_message}, + ] + messages_list.append(messages_for_row) + valid_original_indices.append(index) # This is the original DataFrame index + + print( + f"Prepared {len(messages_list)} valid message sets for batch completion (skipped {len(skipped_rows_indices)} rows)." + ) + if not messages_list: + print("No valid rows found to process after checking required data. Exiting.") + exit() + else: + print( + "No rows found needing processing (df_to_process is empty)." + ) # Should have been caught by earlier check + exit() + + + # --- Call batch_completion in chunks with rate limiting and periodic saving --- + total_messages_to_send = len(messages_list) + num_chunks = math.ceil(total_messages_to_send / CHUNK_SIZE) + + print( + f"\nStarting batch completion for {total_messages_to_send} items in {num_chunks} chunks..." + ) + + overall_start_time = time.time() + processed_count_total = 0 + + for i in range(num_chunks): + chunk_start_message_index = i * CHUNK_SIZE + chunk_end_message_index = min((i + 1) * CHUNK_SIZE, total_messages_to_send) + message_chunk = messages_list[chunk_start_message_index:chunk_end_message_index] + # Get corresponding original DataFrame indices for this chunk + chunk_original_indices = valid_original_indices[ + chunk_start_message_index:chunk_end_message_index + ] + + if not message_chunk: + continue + + min_idx_disp = min(chunk_original_indices) if chunk_original_indices else "N/A" + max_idx_disp = max(chunk_original_indices) if chunk_original_indices else "N/A" + print( + f"\nProcessing chunk {i + 1}/{num_chunks} (Messages {chunk_start_message_index + 1}-{chunk_end_message_index} of this run)..." + f" Corresponding to original indices: {min_idx_disp} - {max_idx_disp}" + ) + chunk_start_time = time.time() + responses = [] + try: + print(f"Sending {len(message_chunk)} requests for chunk {i + 1}...") + responses = litellm.batch_completion( + model=MODEL, + messages=message_chunk, + response_format={ + "type": "json_schema", + "json_schema": SCHEMA_FOR_VALIDATION, + }, + num_retries=3, + # request_timeout=60 # Optional: uncomment if needed + ) + print(f"Chunk {i + 1} API call completed.") + + except Exception as e: + print(f"Error during litellm.batch_completion for chunk {i + 1}: {e}") + responses = [None] * len( + message_chunk + ) # Ensure responses list matches message_chunk length for processing loop + + # --- Process responses for the current chunk --- + chunk_updates = {} # To store {original_df_index: {qty/unit data}} + successful_in_chunk = 0 + failed_in_chunk = 0 + + if responses and len(responses) == len(message_chunk): + for j, response in enumerate(responses): + original_df_index = chunk_original_indices[j] + + # Initialize values for this item + lb_qty_val, lb_unit_val, ub_qty_val, ub_unit_val = None, None, None, None + content_str = None + + if response is None: + print( + f"Skipping processing for original index {original_df_index} due to API call failure for this item (response is None)." 
+ ) + failed_in_chunk += 1 + continue + + try: + if ( + response.choices + and response.choices[0].message + and response.choices[0].message.content + ): + content_str = response.choices[0].message.content + estimate_data = json.loads(content_str) # Can raise JSONDecodeError + + lower_bound_dict = estimate_data.get("lower_bound_estimate") + upper_bound_dict = estimate_data.get("upper_bound_estimate") + + valid_response_structure = isinstance( + lower_bound_dict, dict + ) and isinstance(upper_bound_dict, dict) + + if valid_response_structure: + lb_qty_raw = lower_bound_dict.get("quantity") + lb_unit_raw = lower_bound_dict.get("unit") + ub_qty_raw = upper_bound_dict.get("quantity") + ub_unit_raw = upper_bound_dict.get("unit") + + is_valid_item = True + # Validate LB Qty + if ( + not isinstance(lb_qty_raw, (int, float)) + or math.isnan(float(lb_qty_raw)) + or float(lb_qty_raw) < 0 + ): + print( + f"Warning: Invalid lb_quantity for original index {original_df_index}: {lb_qty_raw}" + ) + is_valid_item = False + else: + lb_qty_val = float(lb_qty_raw) + + # Validate UB Qty + if ( + not isinstance(ub_qty_raw, (int, float)) + or math.isnan(float(ub_qty_raw)) + or float(ub_qty_raw) < 0 + ): + print( + f"Warning: Invalid ub_quantity for original index {original_df_index}: {ub_qty_raw}" + ) + is_valid_item = False + else: + ub_qty_val = float(ub_qty_raw) + + # Validate Units + if lb_unit_raw not in ALLOWED_UNITS: + print( + f"Warning: Invalid lb_unit for original index {original_df_index}: '{lb_unit_raw}'" + ) + is_valid_item = False + else: + lb_unit_val = lb_unit_raw + + if ub_unit_raw not in ALLOWED_UNITS: + print( + f"Warning: Invalid ub_unit for original index {original_df_index}: '{ub_unit_raw}'" + ) + is_valid_item = False + else: + ub_unit_val = ub_unit_raw + + if is_valid_item: + successful_in_chunk += 1 + chunk_updates[original_df_index] = { + "lb_estimate_qty": lb_qty_val, + "lb_estimate_unit": lb_unit_val, + "ub_estimate_qty": ub_qty_val, + "ub_estimate_unit": ub_unit_val, + } + else: + failed_in_chunk += ( + 1 # Values remain None if not fully valid + ) + else: + print( + f"Warning: Missing or malformed estimate dicts in JSON for original index {original_df_index}. Content: '{content_str}'" + ) + failed_in_chunk += 1 + else: + finish_reason = ( + response.choices[0].finish_reason + if (response.choices and response.choices[0].finish_reason) + else "unknown" + ) + error_message = ( + response.choices[0].message.content + if ( + response.choices + and response.choices[0].message + and response.choices[0].message.content + ) + else "No content in message." + ) + print( + f"Warning: Received non-standard or empty response content for original index {original_df_index}. " + f"Finish Reason: '{finish_reason}'. Message: '{error_message}'. Raw Choices: {response.choices}" + ) + failed_in_chunk += 1 + + except json.JSONDecodeError: + print( + f"Warning: Could not decode JSON for original index {original_df_index}. Content received: '{content_str}'" + ) + failed_in_chunk += 1 + except AttributeError as ae: + print( + f"Warning: Missing expected attribute processing response for original index {original_df_index}: {ae}. Response: {response}" + ) + failed_in_chunk += 1 + except Exception as e: + print( + f"Warning: An unexpected error occurred processing response for original index {original_df_index}: {type(e).__name__} - {e}. 
Response: {response}" + ) + failed_in_chunk += 1 + else: + print( + f"Warning: Mismatch between number of responses ({len(responses) if responses else 0}) " + f"and messages sent ({len(message_chunk)}) for chunk {i + 1}, or no responses. Marking all as failed." + ) + failed_in_chunk = len( + message_chunk + ) # All items in this chunk are considered failed if response array is problematic + + print( + f"Chunk {i + 1} processing summary: Success={successful_in_chunk}, Failed/Skipped={failed_in_chunk}" + ) + processed_count_total += successful_in_chunk + + # --- Update Main DataFrame and Save Periodically --- + if chunk_updates: + print( + f"Updating main DataFrame with {len(chunk_updates)} new estimates for chunk {i + 1}..." + ) + for idx, estimates in chunk_updates.items(): + if idx in df.index: + df.loc[idx, "lb_estimate_qty"] = estimates["lb_estimate_qty"] + df.loc[idx, "lb_estimate_unit"] = estimates["lb_estimate_unit"] + df.loc[idx, "ub_estimate_qty"] = estimates["ub_estimate_qty"] + df.loc[idx, "ub_estimate_unit"] = estimates["ub_estimate_unit"] + + print(f"Saving progress to {FILENAME}...") + save_dataframe(df, FILENAME) + else: + print(f"No successful estimates obtained in chunk {i + 1} to save.") + + # --- Rate Limiting Pause --- + chunk_end_time = time.time() + chunk_duration = chunk_end_time - chunk_start_time + print(f"Chunk {i + 1} took {chunk_duration:.2f} seconds.") + + if i < num_chunks - 1: # No pause after the last chunk + # Calculate ideal time per request based on rate limit + time_per_request = SECONDS_PER_MINUTE / RATE_LIMIT if RATE_LIMIT > 0 else 0 + # Calculate minimum duration this chunk should have taken to respect rate limit + min_chunk_duration_for_rate = len(message_chunk) * time_per_request + # Calculate pause needed + pause_needed = max(0, min_chunk_duration_for_rate - chunk_duration) + + if pause_needed > 0: + print( + f"Pausing for {pause_needed:.2f} seconds to respect rate limit ({RATE_LIMIT}/min)..." + ) + time.sleep(pause_needed) + + overall_end_time = time.time() + total_duration_minutes = (overall_end_time - overall_start_time) / 60 + print( + f"\nBatch completion finished." + f" Processed {processed_count_total} new estimates in this run in {total_duration_minutes:.2f} minutes." 
+ ) + + print(f"Performing final save to {FILENAME}...") + save_dataframe(df, FILENAME) + + print("\nScript finished.") diff --git a/old/analysis.py b/old/analysis.py new file mode 100644 index 0000000..de988ad --- /dev/null +++ b/old/analysis.py @@ -0,0 +1,521 @@ +import os +import litellm +import sqlite3 +import numpy as np +import pandas as pd +from google.colab import userdata, files +import seaborn as sns +import matplotlib.pyplot as plt +import matplotlib as mpl + +os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY') +os.environ['GEMINI_API_KEY'] = userdata.get('GEMINI_API_KEY') + +occupation_major_codes = { + '11': 'Management', + '13': 'Business and Financial Operations', + '15': 'Computer and Mathematical Occupations', + '17': 'Architecture and Engineering', + '19': 'Life, Physical, and Social Science', + '21': 'Community and Social Services', + '23': 'Legal', + '25': 'Education, Training, and Library', + '27': 'Arts, Design, Entertainment, Sports, and Media', + '29': 'Healthcare Practitioners and Technical', + '31': 'Healthcare Support', + '33': 'Protective Service', + '35': 'Food Preparation and Serving Related', + '37': 'Building and Grounds Cleaning and Maintenance', + '39': 'Personal Care and Service', + '41': 'Sales and Related', + '43': 'Office and Administrative Support', + '45': 'Farming, Fishing, and Forestry', + '47': 'Construction and Extraction', + '49': 'Installation, Maintenance, and Repair', + '51': 'Production', + '53': 'Transportation and Material Moving', + '55': 'Military Specific' +} + +gray = {'50':'#f8fafc','100':'#f1f5f9','200':'#e2e8f0', + '300':'#cbd5e1','400':'#94a3b8','500':'#64748b', + '600':'#475569','700':'#334155','800':'#1e293b', + '900':'#0f172a','950':'#020617'} +lime = {'50': '#f7fee7','100': '#ecfcca','200': '#d8f999', + '300': '#bbf451','400': '#9ae600','500': '#83cd00', + '600': '#64a400','700': '#497d00','800': '#3c6300', + '900': '#35530e','950': '#192e03'} + +mpl.rcParams.update({ + 'figure.facecolor' : gray['50'], + 'axes.facecolor' : gray['50'], + 'axes.edgecolor' : gray['100'], + 'axes.labelcolor' : gray['700'], + 'xtick.color' : gray['700'], + 'ytick.color' : gray['700'], + 'font.family' : 'Inter', # falls back to DejaVu if Inter not present + 'font.size' : 11, +}) + +sns.set_style("white") # keep minimal axes, we will remove default grid +sns.set_context("notebook") + +def prepare_tasks(): + + # Run uv run ./enrich_task_ratings.py + df_tasks = pd.read_json("task_ratings_enriched.json") + + # Run uv run classify_estimateability_of_tasks.py + df_task_estimateable = pd.read_csv("tasks_estimateable.csv").rename(columns={"task_estimateable": "estimateable"}).drop_duplicates(subset=['task'], keep='first') + + # df_tasks now has a remote_status column which contains either "remote" or "not remote" + df_tasks = pd.merge(df_tasks, df_remote_status[['Task', 'Remote']], left_on='task', right_on='Task', how='left') + df_tasks = df_tasks.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'}) + + # df_tasks now has a estimateable column which contains either "ATOMIC" or "ONGOING-CONSTRAINT" + df_tasks = pd.merge(df_tasks, df_task_estimateable[['task', 'estimateable']], on='task', how='left') + + df_tasks = df_tasks[df_tasks['importance_average'] < 3].copy() + + df_tasks['onetsoc_major'] = df_tasks['onetsoc_code'].str[:2] + + df_remote_tasks = df_tasks[df_tasks['remote_status'] == 'remote'].copy() + + # Call create_task_estimates() from add_task_estimates? 
which creates tasks_with_estimates.csv + +def preprocessing_time_estimates(): + df = pd.read_csv("tasks_with_estimates.csv") + + df = df[df['importance_average'] > 3].copy() + + # The embeddings comes from running `uv run ./embed_task_description.py` + # Columns: ['embedding_id', 'task', 'embedding_vector'] + # These contain embedding for UNIQUE tasks + df_task_embeddings = pd.read_parquet("tasks_with_embeddings.parquet").drop_duplicates(subset=['task'])[['task', 'task_embedding']].rename(columns={"task_embedding": "embedding_vector"}).copy() + + df = pd.merge(df, df_task_embeddings[['task', 'embedding_vector']], on='task', how='left') + df = pd.merge(df, df_task_estimateable[['task', 'estimateable']], on='task', how='left') + + df['onetsoc_major'] = df['onetsoc_code'].str[:2] + + def convert_to_minutes(qty, unit): + """Converts a quantity in a given unit to minutes.""" + return qty * { + "minute": 1, + "hour": 60, + "day": 60 * 24, + "week": 60 * 24 * 7, + "month": 60 * 24 * 30, + "trimester": 60 * 24 * 90, + "semester": 60 * 24 * 180, + "year": 60 * 24 * 365, + }[unit] + + df['lb_estimate_in_minutes'] = df.apply( + lambda row: convert_to_minutes(row['lb_estimate_qty'], row['lb_estimate_unit']), axis=1 + ) + df['ub_estimate_in_minutes'] = df.apply( + lambda row: convert_to_minutes(row['ub_estimate_qty'], row['ub_estimate_unit']), axis=1 + ) + + df['estimate_range'] = df.ub_estimate_in_minutes - df.lb_estimate_in_minutes + df['estimate_ratio'] = df.ub_estimate_in_minutes / df.lb_estimate_in_minutes + df['estimate_midpoint'] = (df.lb_estimate_in_minutes + df.ub_estimate_in_minutes)/2 + + atomic_tasks = df[df['estimateable'] == 'ATOMIC'] + ongoing_tasks = df[df['estimateable'] == 'ONGOING-CONSTRAINT'] + + with pd.option_context('display.max_columns', None): + display(df) + + # Check for empty estimates + if atomic_tasks['lb_estimate_in_minutes'].isnull().sum() > 0: + print("Missing values in 'lb_estimate_in_minutes':", atomic_tasks['lb_estimate_in_minutes'].isnull().sum()) + + if atomic_tasks['ub_estimate_in_minutes'].isnull().sum() > 0: + print("Missing values in 'ub_estimate_in_minutes':", atomic_tasks['ub_estimate_in_minutes'].isnull().sum()) + + # Check for impossible bounds + impossible_bounds = atomic_tasks[ + (atomic_tasks['lb_estimate_in_minutes'] <= 0) | + (atomic_tasks['ub_estimate_in_minutes'] <= 0) | + (atomic_tasks['lb_estimate_in_minutes'] > atomic_tasks['ub_estimate_in_minutes']) + ] + if not impossible_bounds.empty: + print(f"Error: Found rows with impossible bounds.") + with pd.option_context('display.max_colwidth', None): + display(impossible_bounds[['task', 'lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'dwas']]) + + #with pd.option_context('display.max_colwidth', None): + #display(atomic_tasks.nlargest(20, 'ub_estimate_in_minutes')[['task', 'lb_estimate_qty', 'lb_estimate_unit', 'lb_estimate_in_minutes', 'ub_estimate_qty', 'ub_estimate_unit', 'ub_estimate_in_minutes', 'estimate_ratio']]) + +def cell1(): + sns.histplot(atomic_tasks.estimate_midpoint, log_scale=True) + +def cell2(): + plt.figure(figsize=(14,10)) + sns.boxplot( + data=atomic_tasks, + x='onetsoc_major', # 11 = Management, 15 = Computer/Math, … + y='estimate_range', + showfliers=False + ) + plt.yscale('log') # long tail => log scale + plt.xlabel('Occupation') + plt.ylabel('Range (upper-lower, minutes)') + plt.title('Spread of time-range estimates per occupation') + + ax = plt.gca() + ax.set_xticklabels([occupation_major_codes[code.get_text()] for code in ax.get_xticklabels()], rotation=60, ha='right') + 
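+# The cell*() helpers below are notebook-style cells: they rely on
+# preprocessing_time_estimates() having been run first so that `atomic_tasks`
+# is available in the enclosing (module/notebook) scope.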
+def cell3(): + plt.figure(figsize=(10, 10)) + ax = sns.scatterplot( + data=atomic_tasks.replace({'onetsoc_major': occupation_major_codes}), # Replace codes with labels + x='lb_estimate_in_minutes', y='ub_estimate_in_minutes', + alpha=0.2, edgecolor=None, hue="onetsoc_major" # Use the labeled column for hue + ) + + # 45° reference + lims = (1, atomic_tasks[['lb_estimate_in_minutes','ub_estimate_in_minutes']].max().max()) + ax.plot(lims, lims, color='black', linestyle='--', linewidth=1) + + # optional helper lines: 2× and 10×, 100× ratios + for k in [2,10, 100]: + ax.plot(lims, [k*l for l in lims], + linestyle=':', color='grey', linewidth=1) + + ax.set(xscale='log', yscale='log') + ax.set_xlabel('Lower-bound (min, log scale)') + ax.set_ylabel('Upper-bound (min, log scale)') + ax.set_title('Lower vs upper estimates for all tasks') + + # Place the legend outside the plot + ax.legend(bbox_to_anchor=(1, 1), loc='upper left') + +def cell4(): + plt.figure(figsize=(8,4)) + sns.histplot(np.log10(atomic_tasks['estimate_ratio'].replace([np.inf, -np.inf], np.nan).dropna()), + bins=60, kde=True) + plt.axvline(np.log10(10), color='red', ls='--', lw=1, label='10×') + plt.axvline(np.log10(1.05), color='orange', ls='--', lw=1, label='1.05×') + plt.axvline(0, color='black', ls='-', lw=1) # ub = lb + plt.xlabel('log₁₀(upper / lower)') + plt.ylabel('Count') + plt.title('Distribution of upper:lower ratio') + plt.legend() + plt.tight_layout() + + +def cell5(): + # 1. Bin lower bounds into quartiles (Q1–Q4) + atomic_tasks['lb_q'] = pd.qcut(atomic_tasks.lb_estimate_in_minutes, + q=4, labels=['Q1 shortest','Q2','Q3','Q4 longest']) + + + # 3. Aggregate: median (or mean) ratio per cell + pivot = atomic_tasks.pivot_table(index='onetsoc_major', columns='lb_q', + values='estimate_ratio', aggfunc='median') + + # Map the index (onetsoc_major codes) to their corresponding labels + pivot.index = pivot.index.map(occupation_major_codes) + + + # 4. 
Visualise + plt.figure(figsize=(10,8)) + sns.heatmap(pivot, cmap='RdYlGn_r', center=2, annot=True, fmt='.1f', + cbar_kws={'label':'Median upper/lower ratio'}) + plt.xlabel('Lower-bound quartile') + plt.ylabel('Occupation (major group)') + plt.title('Typical range width by occupation and task length') + plt.tight_layout() + + + +def cell6(): + """ + from scipy.stats import median_abs_deviation + + def mad_z(series): + med = series.median() + mad = median_abs_deviation(series, scale='normal') # ⇒ comparable to σ + return (series - med) / mad + + df['robust_z'] = df.groupby('onetsoc_code')['estimate_midpoint'].transform(mad_z) + """ + + agg = (atomic_tasks + .groupby('onetsoc_code')['estimate_midpoint'] + .agg(median='median', + q1=lambda x: x.quantile(.25), + q3=lambda x: x.quantile(.75), + mean='mean', + std='std') + .reset_index()) + agg['IQR'] = agg.q3 - agg.q1 + agg['CV'] = agg['std'] / agg['mean'] # coefficient of variation + + # merge back the group mean and std so each row can be scored + atomic_tasks = atomic_tasks.merge(agg[['onetsoc_code','mean','std']], on='onetsoc_code') + + + atomic_tasks['z'] = (atomic_tasks.estimate_midpoint - atomic_tasks['mean']) / atomic_tasks['std'] + outliers = atomic_tasks.loc[atomic_tasks.z.abs() > 3] + outliers + +def cell7(): + from scipy.stats import median_abs_deviation + + def mad_z(series): + med = series.median() + mad = median_abs_deviation(series, scale='normal') # ⇒ comparable to σ + return (series - med) / mad + + atomic_tasks['robust_z'] = atomic_tasks.groupby('onetsoc_code')['estimate_midpoint'].transform(mad_z) + +def cell10(): + import matplotlib.ticker as mtick # For percentage formatting + import matplotlib.colors as mcolors # For color conversion + + summary_data = [] + + for code, label in occupation_major_codes.items(): + occ_df = df_tasks[df_tasks['onetsoc_major'] == code] + total_tasks_in_occ = len(occ_df) + + if total_tasks_in_occ == 0: + continue # Skip if no tasks for this occupation + + # Stack 1: % that isn't equal to "remote" + not_remote_count = len(occ_df[occ_df['remote_status'] != 'remote']) + + # For the remaining remote tasks: + remote_df = occ_df[occ_df['remote_status'] == 'remote'] + + # Stack 2: % of remote + ATOMIC + remote_atomic_count = len(remote_df[remote_df['estimateable'] == 'ATOMIC']) + + # Stack 3: % of remote + ONGOING-CONSTRAINT + remote_ongoing_count = len(remote_df[remote_df['estimateable'] == 'ONGOING-CONSTRAINT']) + + summary_data.append({ + 'onetsoc_major_code': code, + 'occupation_label': label, + 'count_not_remote': not_remote_count, + 'count_remote_atomic': remote_atomic_count, + 'count_remote_ongoing': remote_ongoing_count, + 'total_tasks': total_tasks_in_occ + }) + + summary_df = pd.DataFrame(summary_data) + + # --- 3. 
Calculate Percentages --- + # Ensure total_tasks is not zero to avoid division by zero errors if an occupation had no tasks + summary_df = summary_df[summary_df['total_tasks'] > 0].copy() # Use .copy() to avoid SettingWithCopyWarning + + summary_df['pct_not_remote'] = (summary_df['count_not_remote'] / summary_df['total_tasks']) * 100 + summary_df['pct_remote_atomic'] = (summary_df['count_remote_atomic'] / summary_df['total_tasks']) * 100 + summary_df['pct_remote_ongoing'] = (summary_df['count_remote_ongoing'] / summary_df['total_tasks']) * 100 + + # Select columns for plotting and set index to occupation label + plot_df = summary_df.set_index('occupation_label')[ + ['pct_not_remote', 'pct_remote_atomic', 'pct_remote_ongoing'] + ] + + # Rename columns for a clearer legend + plot_df.columns = ['Not Remote', 'Remote + Estimable', 'Remote + Not estimable'] + + plot_df = plot_df.sort_values(by='Not Remote', ascending=False) + + + # --- 4. Plotting (Modified) --- + + # Define the custom colors based on your requirements + # The order must match the column order in plot_df: + # 1. 'Not Remote' + # 2. 'Remote & ATOMIC' + # 3. 'Remote & ONGOING-CONSTRAINT' + bar_colors = [gray["300"], lime["500"], lime["200"]] + + fig, ax = plt.subplots(figsize=(14, 10)) # Adjusted figsize for better readability + + plot_df.plot(kind='barh', stacked=True, ax=ax, color=bar_colors) + + ax.set_xlabel("Percentage of Tasks (%)", fontsize=12) + ax.set_ylabel("Occupation Major Group", fontsize=12) + ax.set_title("Task Breakdown by Occupation, Remote Status, and Estimateability", fontsize=14, pad=20) + + # Format x-axis as percentages + ax.xaxis.set_major_formatter(mtick.PercentFormatter()) + plt.xlim(0, 100) # Ensure x-axis goes from 0 to 100% + + # Remove right and top spines + ax.spines['right'].set_visible(False) + ax.spines['top'].set_visible(False) + + # Function to get contrasting text color + def get_contrasting_text_color(bg_color_hex_or_rgba): + """ + Determines if black or white text provides better contrast against a given background color. + bg_color_hex_or_rgba: A hex string (e.g., '#RRGGBB') or an RGBA tuple (values in [0, 1]). + Returns: 'black' or 'white'. + """ + # Convert to RGBA if it's a hex string or name + if isinstance(bg_color_hex_or_rgba, str): + rgba = mcolors.to_rgba(bg_color_hex_or_rgba) + else: + rgba = bg_color_hex_or_rgba + + r, g, b, _ = rgba # Ignore alpha for luminance calculation + # Calculate luminance (standard formula for sRGB) + # Values r, g, b should be in [0, 1] for this formula + luminance = 0.2126 * r + 0.7152 * g + 0.0722 * b + # Threshold for deciding text color + return 'black' if luminance > 0.55 else 'white' # Adjusted threshold slightly for better visual + + # Add percentages inside each bar segment + # Iterate through each "category" of bars (Not Remote, Remote & ATOMIC, etc.) 
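+    # ax.containers holds one BarContainer per plotted column, in the same order as
+    # plot_df.columns, so bar_colors[i] matches both the i-th segment and legend entry.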
+ for i, container in enumerate(ax.containers): + # Get the color for this container/category + segment_color = bar_colors[i] + text_color = get_contrasting_text_color(segment_color) + + for patch in container.patches: # Iterate through each bar segment in the category + width = patch.get_width() + if width > 3: # Only add text if segment is wide enough (e.g., >3%) + x = patch.get_x() + width / 2 + y = patch.get_y() + patch.get_height() / 2 + ax.text(x, y, + f"{width:.1f}%", + ha='center', + va='center', + fontsize=8, # Adjust font size as needed + color=text_color, + fontweight='medium') # Bolder text can help + + + plt.legend(title="Task Category", bbox_to_anchor=(1.02, 1), loc='upper left', frameon=False) + +def cell11(): + df_oesm['onetsoc_major'] = df_oesm['OCC_CODE'].str[:2] + + # Calculate wage bill per occupation + # Wage bill = Total Employment * Annual Mean Wage + # Ensure columns are numeric, converting non-numeric values to NaN first + df_oesm['TOT_EMP'] = pd.to_numeric(df_oesm['TOT_EMP'], errors='coerce') + df_oesm['A_MEAN'] = pd.to_numeric(df_oesm['A_MEAN'], errors='coerce') + + # Drop rows with NaN in necessary columns after coercion + df_oesm.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_major'], inplace=True) + + df_oesm['wage_bill'] = df_oesm['TOT_EMP'] * df_oesm['A_MEAN'] + + # Aggregate wage bill by onetsoc_major + df_wage_bill_major = df_oesm.groupby('onetsoc_major')['wage_bill'].sum().reset_index() + + # Map major codes to titles for better plotting + df_wage_bill_major['OCC_TITLE_MAJOR'] = df_wage_bill_major['onetsoc_major'].map(occupation_major_codes) + + # Sort by wage bill for better visualization + df_wage_bill_major = df_wage_bill_major.sort_values('wage_bill', ascending=False) + + # Plotting + plt.figure(figsize=(12, 8)) + sns.barplot(x='wage_bill', y='OCC_TITLE_MAJOR', data=df_wage_bill_major, palette="viridis") + plt.title('Total Wage Bill per Major Occupation Group') + plt.xlabel('Total Wage Bill (in billions)') + plt.ylabel('Major Occupation Group') + plt.grid(axis='x', linestyle='--', alpha=0.7) + +def cell11(): + # ─────────────────────────────────────────────────────────────── + # 1. CUMULATIVE-DISTRIBUTION-FUNCTION (CDF) PREP + # ─────────────────────────────────────────────────────────────── + def cdf(series): + s = series.sort_values().reset_index(drop=True) + return s.values, ((s.index + 1) / len(s)) * 100 + + x_lb , y_lb = cdf(atomic_tasks['lb_estimate_in_minutes']) + x_ub , y_ub = cdf(atomic_tasks['ub_estimate_in_minutes']) + x_mid, y_mid = cdf((atomic_tasks['ub_estimate_in_minutes'] + atomic_tasks['lb_estimate_in_minutes']) / 2) + + # ─────────────────────────────────────────────────────────────── + # 2. 
PLOTTING + # ─────────────────────────────────────────────────────────────── + fig, ax = plt.subplots(figsize=(10, 6)) + + # horizontal reference lines every 10 % + for y_val in range(0, 101, 10): + ax.axhline(y_val, color=gray['100'], linewidth=.8, zorder=1) + + # Plot Lower Bound CDF + ax.step(x_lb, y_lb, + where='post', + color=lime['300'], # Example: light blue for lower bound + linewidth=1.8, + linestyle='--', + zorder=2, + label='Lower bound estimate (CDF)') + + # Plot Upper Bound CDF + ax.step(x_ub, y_ub, + where='post', + color=lime['900'], # Example: light orange/red for upper bound + linewidth=1.8, + linestyle=':', + zorder=3, + label='Upper bound estimate (CDF)') + + # Plot Midpoint CDF (plotted last to be on top, or adjust zorder) + ax.step(x_mid, y_mid, + where='post', + color=lime['600'], + linewidth=2.2, + zorder=4, # Ensure it's on top of other lines if they overlap significantly + label='Mid-point estimate (CDF)') + + + # axes limits / scales + ax.set_ylim(0, 100) + ax.set_xscale('log') + + # y-axis ➝ percent labels + ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(decimals=0)) + + + # move y-label to top-left (just inside plotting area) + ax.text(-0.06, 1.03, + "% of tasks with temporal coherence ≤ X", + ha='left', va='bottom', + transform=ax.transAxes, + fontsize=12, fontweight='semibold') + + # custom x-ticks at human-friendly durations + ticks = [1, 5, 10, 30, 60, 120, 240, 480, + 1440, 2880, 10080, 43200, 129600, + 259200, 525600] + ticklabels = ['1 min', '5 min', '10 min', '30 min', '1 hour', '2 hours', '4 hours', '8 hours', + '1 day', '2 days', '1 week', '30 days', + '90 days', '180 days', '1 year'] + + # Vertical reference lines for x-ticks + for tick in ticks: + ax.axvline(tick, color=gray['300'], linewidth=.8, linestyle='--', zorder=1) + + ax.set_xticks(ticks) + ax.set_xticklabels(ticklabels, rotation=45, ha='right') + + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.spines['left'].set_edgecolor(gray['300']) + ax.spines['bottom'].set_edgecolor(gray['300']) + + + # legend + ax.legend(frameon=False, loc='lower right') # Keep 'lower right' or adjust as needed + + ax.text(0.5, -0.3, + 'Temporal coherence (X)', + ha='center', va='center', + transform=ax.transAxes, + fontsize=12, fontweight='semibold') diff --git a/old/classify_estimateability_of_tasks.py b/old/classify_estimateability_of_tasks.py new file mode 100644 index 0000000..ccf110b --- /dev/null +++ b/old/classify_estimateability_of_tasks.py @@ -0,0 +1,411 @@ +import pandas as pd +import litellm +import dotenv +import os +import time +import json +import math + +# Load environment variables +dotenv.load_dotenv(override=True) + +# litellm._turn_on_debug() # Optional debugging + +# --- Configuration --- +MODEL = "gpt-4.1-mini" # Make sure this model supports json_schema or structured output +RATE_LIMIT = 5000 # Requests per minute +CHUNK_SIZE = 300 # Number of unique tasks per API call +SECONDS_PER_MINUTE = 60 + +# File configuration +CLASSIFICATION_FILENAME = "tasks_estimateable.csv" # Output file with classifications +TASK_SOURCE_FOR_INIT_FILENAME = "tasks_with_estimates.csv" +OUTPUT_COLUMN_NAME = "task_estimateable" +SOURCE_FILTER_COLUMN = "remote_status" +SOURCE_FILTER_VALUE = "remote" + +# --- Prompts and Schema --- +SYSTEM_PROMPT_CLASSIFY = """ +Classify the provided O*NET task into one of these categories: + - ATOMIC (schedulable): A single, clearly-bounded activity, typically lasting minutes, hours, or a few days. 
+ - ONGOING-CONSTRAINT (background role/ethical rule): A continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”). +""".strip() + +USER_MESSAGE_TEMPLATE_CLASSIFY = "Task: {task}" + +CLASSIFICATION_CATEGORIES = ["ATOMIC", "ONGOING-CONSTRAINT"] + +SCHEMA_FOR_CLASSIFICATION = { + "name": "classify_task_type", + "strict": True, + "schema": { + "type": "object", + "properties": { + "task_category": { + "type": "string", + "enum": CLASSIFICATION_CATEGORIES, + "description": "The classification of the task (ATOMIC or ONGOING-CONSTRAINT).", + } + }, + "required": ["task_category"], + "additionalProperties": False, + }, +} + + +def save_dataframe(df_to_save, filename): + """Saves the DataFrame to the specified CSV file using atomic write.""" + try: + temp_filename = filename + ".tmp" + df_to_save.to_csv(temp_filename, encoding="utf-8-sig", index=False) + os.replace(temp_filename, filename) + except Exception as e: + print(f"--- Error saving DataFrame to {filename}: {e} ---") + if os.path.exists(temp_filename): + try: + os.remove(temp_filename) + except Exception as remove_err: + print( + f"--- Error removing temporary save file {temp_filename}: {remove_err} ---" + ) + + +# --- Load or Initialize DataFrame --- +try: + if os.path.exists(CLASSIFICATION_FILENAME): + df = pd.read_csv(CLASSIFICATION_FILENAME, encoding="utf-8-sig") + print(f"Successfully read {len(df)} rows from {CLASSIFICATION_FILENAME}.") + + save_needed_after_load = False + if OUTPUT_COLUMN_NAME not in df.columns: + df[OUTPUT_COLUMN_NAME] = pd.NA + print(f"Added '{OUTPUT_COLUMN_NAME}' column.") + save_needed_after_load = True + + df[OUTPUT_COLUMN_NAME].replace(["", None, ""], pd.NA, inplace=True) + + if df[OUTPUT_COLUMN_NAME].dtype != object and not isinstance( + df[OUTPUT_COLUMN_NAME].dtype, pd.StringDtype + ): + try: + df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].astype(object) + print( + f"Corrected dtype of '{OUTPUT_COLUMN_NAME}' to {df[OUTPUT_COLUMN_NAME].dtype}." + ) + save_needed_after_load = True + except Exception as e: + print( + f"Warning: Could not convert column '{OUTPUT_COLUMN_NAME}' to object: {e}." + ) + + if "task" not in df.columns: + print( + f"Error: {CLASSIFICATION_FILENAME} must contain a 'task' column for processing." + ) + exit() + + if save_needed_after_load: + print(f"Saving {CLASSIFICATION_FILENAME} after adding/adjusting column.") + save_dataframe(df, CLASSIFICATION_FILENAME) + else: + print( + f"{CLASSIFICATION_FILENAME} not found. Attempting to create it from {TASK_SOURCE_FOR_INIT_FILENAME}." + ) + if not os.path.exists(TASK_SOURCE_FOR_INIT_FILENAME): + print( + f"Error: Source file {TASK_SOURCE_FOR_INIT_FILENAME} not found. Cannot create {CLASSIFICATION_FILENAME}." + ) + exit() + + df_source = pd.read_csv(TASK_SOURCE_FOR_INIT_FILENAME, encoding="utf-8-sig") + + required_source_cols_for_init = ["task", SOURCE_FILTER_COLUMN] + missing_source_cols = [ + col for col in required_source_cols_for_init if col not in df_source.columns + ] + if missing_source_cols: + print( + f"Error: Source file {TASK_SOURCE_FOR_INIT_FILENAME} is missing required columns for initialization: {', '.join(missing_source_cols)}." + ) + exit() + + df_source_filtered = df_source[ + df_source[SOURCE_FILTER_COLUMN] == SOURCE_FILTER_VALUE + ].copy() + + if df_source_filtered.empty: + print( + f"Warning: No tasks with '{SOURCE_FILTER_COLUMN}' == '{SOURCE_FILTER_VALUE}' found in {TASK_SOURCE_FOR_INIT_FILENAME}. 
" + f"{CLASSIFICATION_FILENAME} will be created with schema but no tasks to classify initially." + ) + + df = df_source_filtered[["task"]].copy() + df[OUTPUT_COLUMN_NAME] = pd.NA + df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].astype(object) + + print( + f"Created {CLASSIFICATION_FILENAME} using tasks from {TASK_SOURCE_FOR_INIT_FILENAME} " + f"(where {SOURCE_FILTER_COLUMN}='{SOURCE_FILTER_VALUE}'). New file has {len(df)} tasks." + ) + save_dataframe(df, CLASSIFICATION_FILENAME) + +except FileNotFoundError: + print(f"Error: A required file was not found. Please check paths.") + exit() +except Exception as e: + print(f"Error during DataFrame loading or initialization: {e}") + exit() + + +# --- Identify Unique Tasks to Process --- +if df.empty: + print(f"{CLASSIFICATION_FILENAME} is empty. Nothing to process. Exiting.") + exit() + +initial_unprocessed_mask = df[OUTPUT_COLUMN_NAME].isna() + +if not initial_unprocessed_mask.any(): + print( + f"All tasks in {CLASSIFICATION_FILENAME} seem to have been classified already. Exiting." + ) + exit() + +# Filter for rows that are unprocessed AND have a valid 'task' string +valid_tasks_to_consider_df = df[ + initial_unprocessed_mask & df["task"].notna() & (df["task"].str.strip() != "") +] + +if valid_tasks_to_consider_df.empty: + print( + f"No valid, unclassified tasks found to process (after filtering out empty/NaN task descriptions). Exiting." + ) + exit() + +unique_task_labels_for_api = ( + valid_tasks_to_consider_df["task"].drop_duplicates().tolist() +) +total_rows_to_update_potentially = len( + df[initial_unprocessed_mask] +) # Count all rows that are NA + +print( + f"Found {total_rows_to_update_potentially} total rows in {CLASSIFICATION_FILENAME} needing classification." +) +print( + f"Identified {len(unique_task_labels_for_api)} unique, valid task labels to send to the API." +) + + +# --- Prepare messages for batch completion (only for unique task labels) --- +messages_list = [] +print(f"Preparing messages for {len(unique_task_labels_for_api)} unique task labels...") + +for task_label in unique_task_labels_for_api: + # task_label is already guaranteed to be non-empty and not NaN from the filtering above + user_message = USER_MESSAGE_TEMPLATE_CLASSIFY.format(task=task_label) + messages_for_task = [ + {"role": "system", "content": SYSTEM_PROMPT_CLASSIFY}, + {"role": "user", "content": user_message}, + ] + messages_list.append(messages_for_task) + +print(f"Prepared {len(messages_list)} message sets for batch completion.") +if ( + not messages_list +): # Should only happen if unique_task_labels_for_api was empty, caught above + print( + "No messages prepared, though unique tasks were identified. This is unexpected. Exiting." + ) + exit() + + +# --- Call batch_completion in chunks with rate limiting and periodic saving --- +total_unique_tasks_to_send = len( + messages_list +) # Same as len(unique_task_labels_for_api) +num_chunks = math.ceil(total_unique_tasks_to_send / CHUNK_SIZE) + +print( + f"\nStarting batch classification for {total_unique_tasks_to_send} unique task labels in {num_chunks} chunks..." 
+) + +overall_start_time = time.time() +processed_rows_count_total = 0 # Counts actual rows updated in the DataFrame + +for i in range(num_chunks): + chunk_start_message_index = i * CHUNK_SIZE + chunk_end_message_index = min((i + 1) * CHUNK_SIZE, total_unique_tasks_to_send) + + message_chunk = messages_list[chunk_start_message_index:chunk_end_message_index] + # Get corresponding unique task labels for this chunk + chunk_task_labels = unique_task_labels_for_api[ + chunk_start_message_index:chunk_end_message_index + ] + + if not message_chunk: # Should not happen if loop range is correct + continue + + print( + f"\nProcessing chunk {i + 1}/{num_chunks} (Unique Task Labels {chunk_start_message_index + 1}-{chunk_end_message_index} of this run)..." + ) + chunk_start_time = time.time() + responses = [] + try: + print( + f"Sending {len(message_chunk)} requests (for unique tasks) for chunk {i + 1}..." + ) + responses = litellm.batch_completion( + model=MODEL, + messages=message_chunk, + response_format={ + "type": "json_schema", + "json_schema": SCHEMA_FOR_CLASSIFICATION, + }, + num_retries=3, + ) + print(f"Chunk {i + 1} API call completed.") + + except Exception as e: + print(f"Error during litellm.batch_completion for chunk {i + 1}: {e}") + responses = [None] * len(message_chunk) + + # --- Process responses for the current chunk --- + # chunk_updates stores {task_label: classification_category} + chunk_task_classifications = {} + successful_api_calls_in_chunk = 0 + failed_api_calls_in_chunk = 0 + + if responses and len(responses) == len(message_chunk): + for j, response in enumerate(responses): + current_task_label = chunk_task_labels[ + j + ] # The unique task label for this response + content_str = None + + if response is None: + print( + f"API call failed for task label '{current_task_label}' (response is None)." + ) + failed_api_calls_in_chunk += 1 + continue + + try: + if ( + response.choices + and response.choices[0].message + and response.choices[0].message.content + ): + content_str = response.choices[0].message.content + classification_data = json.loads(content_str) + category_raw = classification_data.get("task_category") + + if category_raw in CLASSIFICATION_CATEGORIES: + successful_api_calls_in_chunk += 1 + chunk_task_classifications[current_task_label] = category_raw + else: + print( + f"Warning: Invalid or missing task_category for task label '{current_task_label}': '{category_raw}'. Content: '{content_str}'" + ) + failed_api_calls_in_chunk += 1 + else: + finish_reason = ( + response.choices[0].finish_reason + if (response.choices and response.choices[0].finish_reason) + else "unknown" + ) + error_message = ( + response.choices[0].message.content + if (response.choices and response.choices[0].message) + else "No content in message." + ) + print( + f"Warning: Received non-standard or empty response content for task label '{current_task_label}'. " + f"Finish Reason: '{finish_reason}'. Message: '{error_message}'. Raw Choices: {response.choices}" + ) + failed_api_calls_in_chunk += 1 + + except json.JSONDecodeError: + print( + f"Warning: Could not decode JSON for task label '{current_task_label}'. Content received: '{content_str}'" + ) + failed_api_calls_in_chunk += 1 + except AttributeError as ae: + print( + f"Warning: Missing attribute processing response for task label '{current_task_label}': {ae}. 
Response: {response}" + ) + failed_api_calls_in_chunk += 1 + except Exception as e: + print( + f"Warning: Unexpected error processing response for task label '{current_task_label}': {type(e).__name__} - {e}. Response: {response}" + ) + failed_api_calls_in_chunk += 1 + else: + print( + f"Warning: Mismatch between #responses ({len(responses) if responses else 0}) " + f"and #messages sent ({len(message_chunk)}) for chunk {i + 1}, or no responses. Marking all API calls in chunk as failed." + ) + failed_api_calls_in_chunk = len(message_chunk) + + # --- Update Main DataFrame and Save Periodically --- + rows_updated_this_chunk = 0 + if chunk_task_classifications: + print( + f"Updating main DataFrame with classifications for {len(chunk_task_classifications)} unique tasks from chunk {i + 1}..." + ) + for task_label, category in chunk_task_classifications.items(): + # Update all rows in the main df that match this task_label AND are still NA in the output column + update_condition = (df["task"] == task_label) & ( + df[OUTPUT_COLUMN_NAME].isna() + ) + num_rows_for_this_task_label = df[update_condition].shape[0] + + if num_rows_for_this_task_label > 0: + df.loc[update_condition, OUTPUT_COLUMN_NAME] = category + rows_updated_this_chunk += num_rows_for_this_task_label + + print( + f"Updated {rows_updated_this_chunk} rows in the DataFrame based on this chunk's API responses." + ) + print(f"Saving progress to {CLASSIFICATION_FILENAME}...") + save_dataframe(df, CLASSIFICATION_FILENAME) + else: + print( + f"No successful API classifications obtained in chunk {i + 1} to update DataFrame or save." + ) + + print( + f"Chunk {i + 1} API summary: Successful Calls={successful_api_calls_in_chunk}, Failed/Skipped Calls={failed_api_calls_in_chunk}. " + f"Rows updated in DataFrame this chunk: {rows_updated_this_chunk}" + ) + processed_rows_count_total += rows_updated_this_chunk + + # --- Rate Limiting Pause --- + chunk_end_time = time.time() + chunk_duration = chunk_end_time - chunk_start_time + print(f"Chunk {i + 1} (API calls and DF update) took {chunk_duration:.2f} seconds.") + + if i < num_chunks - 1: + time_per_request = SECONDS_PER_MINUTE / RATE_LIMIT if RATE_LIMIT > 0 else 0 + min_chunk_duration_for_rate = ( + len(message_chunk) * time_per_request + ) # Based on API calls made + pause_needed = max(0, min_chunk_duration_for_rate - chunk_duration) + + if pause_needed > 0: + print( + f"Pausing for {pause_needed:.2f} seconds to respect rate limit ({RATE_LIMIT}/min)..." + ) + time.sleep(pause_needed) + +overall_end_time = time.time() +total_duration_minutes = (overall_end_time - overall_start_time) / 60 +print( + f"\nBatch classification finished." + f" Updated {processed_rows_count_total} rows in '{CLASSIFICATION_FILENAME}' with new classifications in this run." + f" Total duration: {total_duration_minutes:.2f} minutes." +) + +print(f"Performing final save to {CLASSIFICATION_FILENAME}...") +save_dataframe(df, CLASSIFICATION_FILENAME) + +print("\nScript finished.") diff --git a/old/create_onet_database.sh b/old/create_onet_database.sh new file mode 100755 index 0000000..ca5ac09 --- /dev/null +++ b/old/create_onet_database.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash + +# Set database name and directories +ONET_DB_NAME="onet.database" +ONET_ZIP_URL="https://www.onetcenter.org/dl_files/database/db_29_1_mysql.zip" +ONET_ZIP_FILE="db_29_1_mysql.zip" +ONET_EXTRACT_DIR="db_29_1_mysql" + +# Download O*NET database only if not already downloaded +if [ ! 
-f "$ONET_ZIP_FILE" ]; then + echo "Downloading O*NET database from $ONET_ZIP_URL" + curl -L -o "$ONET_ZIP_FILE" "$ONET_ZIP_URL" || wget -O "$ONET_ZIP_FILE" "$ONET_ZIP_URL" + + if [ $? -ne 0 ]; then + echo "Failed to download O*NET database" + exit 1 + fi +else + echo "Using existing O*NET database zip file" +fi + +# Extract downloaded zip file only if extraction directory doesn't exist +if [ ! -d "$ONET_EXTRACT_DIR" ]; then + echo "Extracting O*NET database files" + unzip -o "$ONET_ZIP_FILE" + + if [ $? -ne 0 ]; then + echo "Failed to extract O*NET database files" + exit 1 + fi +else + echo "Using existing extracted O*NET database files" +fi + +# Remove existing database if it exists +if [ -f "$ONET_DB_NAME" ]; then + echo "Removing existing database" + rm "$ONET_DB_NAME" +fi + +# Create a new SQLite database with optimized settings for fast import +echo "Creating new SQLite database: $ONET_DB_NAME with performance settings" +sqlite3 "$ONET_DB_NAME" << EOF +PRAGMA journal_mode = OFF; +PRAGMA synchronous = 0; +PRAGMA cache_size = 1000000; +PRAGMA locking_mode = EXCLUSIVE; +PRAGMA temp_store = MEMORY; +PRAGMA foreign_keys = ON; +EOF + +# Combine and execute all SQL files in one transaction +echo "Executing SQL files in alphabetical order (single transaction mode)" +sqlite3 "$ONET_DB_NAME" << EOF +BEGIN TRANSACTION; +$(find "$ONET_EXTRACT_DIR" -name "*.sql" | sort | xargs cat) +COMMIT; +EOF + +# Check if the execution was successful +if [ $? -ne 0 ]; then + echo "Error executing SQL files in batch transaction" + exit 1 +else + echo "Database populated successfully. Restoring reliability settings..." + + # Restore reliability-focused settings after import + sqlite3 "$ONET_DB_NAME" << EOF +PRAGMA journal_mode = WAL; +PRAGMA synchronous = NORMAL; +PRAGMA locking_mode = NORMAL; +PRAGMA temp_store = DEFAULT; +PRAGMA foreign_keys = ON; +PRAGMA optimize; +VACUUM; +EOF + + if [ $? -ne 0 ]; then + echo "Warning: Failed to restore reliability settings, but database is populated" + else + echo "Reliability settings restored successfully" + fi + + echo "O*NET database created and optimized successfully!" +fi diff --git a/old/enrich_task_ratings.py b/old/enrich_task_ratings.py new file mode 100644 index 0000000..70ae0bf --- /dev/null +++ b/old/enrich_task_ratings.py @@ -0,0 +1,392 @@ +import sqlite3 +import pandas as pd +import json +import os +from collections import defaultdict +import numpy as np + +# --- Configuration --- +DB_FILE = "onet.database" +OUTPUT_FILE = "task_ratings_enriched.json" # Changed output filename + +# --- Database Interaction --- + + +def fetch_data_from_db(db_path): + """ + Fetches required data from the O*NET SQLite database using JOINs, + including DWAs. + + Args: + db_path (str): Path to the SQLite database file. + + Returns: + tuple(pandas.DataFrame, pandas.DataFrame): A tuple containing: + - DataFrame with task ratings info. + - DataFrame with task-to-DWA mapping. + Returns (None, None) if the database file doesn't exist or an error occurs. 
+ """ + if not os.path.exists(db_path): + print(f"Error: Database file not found at {db_path}") + return None, None + + try: + conn = sqlite3.connect(db_path) + # Construct the SQL query to join the tables and select necessary columns + # Added LEFT JOINs for tasks_to_dwas and dwa_reference + # Use LEFT JOIN in case a task has no DWAs + query = """ + SELECT + tr.onetsoc_code, + tr.task_id, + ts.task, + od.title AS occupation_title, + od.description AS occupation_description, + tr.scale_id, + tr.category, + tr.data_value, + dr.dwa_title -- Added DWA title + FROM + task_ratings tr + JOIN + task_statements ts ON tr.task_id = ts.task_id + JOIN + occupation_data od ON tr.onetsoc_code = od.onetsoc_code + LEFT JOIN + tasks_to_dwas td ON tr.onetsoc_code = td.onetsoc_code AND tr.task_id = td.task_id -- + LEFT JOIN + dwa_reference dr ON td.dwa_id = dr.dwa_id; -- + """ + df = pd.read_sql_query(query, conn) + conn.close() + print( + f"Successfully fetched {len(df)} records (including DWA info) from the database." + ) + + if df.empty: + print("Warning: Fetched DataFrame is empty.") + # Return empty DataFrames with expected columns if the main fetch is empty + ratings_cols = [ + "onetsoc_code", + "task_id", + "task", + "occupation_title", + "occupation_description", + "scale_id", + "category", + "data_value", + ] + dwa_cols = ["onetsoc_code", "task_id", "dwa_title"] + return pd.DataFrame(columns=ratings_cols), pd.DataFrame(columns=dwa_cols) + + # Remove duplicates caused by joining ratings with potentially multiple DWAs per task + # Keep only unique combinations of the core task/rating info before processing + core_cols = [ + "onetsoc_code", + "task_id", + "task", + "occupation_title", + "occupation_description", + "scale_id", + "category", + "data_value", + ] + # Check if all core columns exist before attempting to drop duplicates + missing_core_cols = [col for col in core_cols if col not in df.columns] + if missing_core_cols: + print(f"Error: Missing core columns in fetched data: {missing_core_cols}") + return None, None + ratings_df = df[core_cols].drop_duplicates().reset_index(drop=True) + + # Get unique DWA info separately + dwa_cols = ["onetsoc_code", "task_id", "dwa_title"] + # Check if all DWA columns exist before processing + if all(col in df.columns for col in dwa_cols): + dwas_df = ( + df[dwa_cols] + .dropna(subset=["dwa_title"]) + .drop_duplicates() + .reset_index(drop=True) + ) + else: + print("Warning: DWA related columns missing, creating empty DWA DataFrame.") + dwas_df = pd.DataFrame( + columns=dwa_cols + ) # Create empty df if columns missing + + return ratings_df, dwas_df # Return two dataframes now + + except sqlite3.Error as e: + print(f"SQLite error: {e}") + if "conn" in locals() and conn: + conn.close() + return None, None # Return None for both if error + except Exception as e: + print(f"An error occurred during data fetching: {e}") + if "conn" in locals() and conn: + conn.close() + return None, None # Return None for both if error + + +# --- Data Processing --- + + +def process_task_ratings_with_dwas(ratings_df, dwas_df): + """ + Processes the fetched data to group, pivot frequency, calculate averages, + structure the output, and add associated DWAs. + + Args: + ratings_df (pandas.DataFrame): The input DataFrame with task ratings info. + dwas_df (pandas.DataFrame): The input DataFrame with task-to-DWA mapping. Can be None or empty. + + Returns: + list: A list of dictionaries, each representing an enriched task rating with DWAs. 
+ Returns None if the input ratings DataFrame is invalid. + """ + if ratings_df is None or not isinstance( + ratings_df, pd.DataFrame + ): # Check if it's a DataFrame + print("Error: Input ratings DataFrame is invalid.") + return None + if ratings_df.empty: + print( + "Warning: Input ratings DataFrame is empty. Processing will yield empty result." + ) + # Decide how to handle empty input, maybe return empty list directly + # return [] + + # Ensure dwas_df is a DataFrame, even if empty + if dwas_df is None or not isinstance(dwas_df, pd.DataFrame): + print("Warning: Invalid or missing DWA DataFrame. Proceeding without DWA data.") + dwas_df = pd.DataFrame( + columns=["onetsoc_code", "task_id", "dwa_title"] + ) # Ensure it's an empty DF + + print("Starting data processing...") + + # --- 1. Handle Frequency (FT) --- + freq_df = ratings_df[ratings_df["scale_id"] == "FT"].copy() + if not freq_df.empty: + freq_pivot = freq_df.pivot_table( + index=["onetsoc_code", "task_id"], + columns="category", + values="data_value", + fill_value=0, + ) + freq_pivot.columns = [ + f"frequency_category_{int(col)}" for col in freq_pivot.columns + ] + print(f"Processed Frequency data. Shape: {freq_pivot.shape}") + else: + print("No Frequency (FT) data found.") + # Create an empty DataFrame with the multi-index to allow merging later + idx = pd.MultiIndex( + levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"] + ) + freq_pivot = pd.DataFrame(index=idx) + + # --- 2. Handle Importance (IM, IJ) --- + imp_df = ratings_df[ratings_df["scale_id"].isin(["IM", "IJ"])].copy() + if not imp_df.empty: + imp_avg = ( + imp_df.groupby(["onetsoc_code", "task_id"])["data_value"] + .mean() + .reset_index() + ) + imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True) + print(f"Processed Importance data. Shape: {imp_avg.shape}") + else: + print("No Importance (IM, IJ) data found.") + imp_avg = pd.DataFrame( + columns=["onetsoc_code", "task_id", "importance_average"] + ) + + # --- 3. Handle Relevance (RT) --- + rel_df = ratings_df[ratings_df["scale_id"] == "RT"].copy() + if not rel_df.empty: + rel_avg = ( + rel_df.groupby(["onetsoc_code", "task_id"])["data_value"] + .mean() + .reset_index() + ) + rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True) + print(f"Processed Relevance data. Shape: {rel_avg.shape}") + else: + print("No Relevance (RT) data found.") + rel_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "relevance_average"]) + + # --- 4. Process DWAs --- + if dwas_df is not None and not dwas_df.empty and "dwa_title" in dwas_df.columns: + print("Processing DWA data...") + # Group DWAs by task_id and aggregate titles into a list + dwas_grouped = ( + dwas_df.groupby(["onetsoc_code", "task_id"])["dwa_title"] + .apply(list) + .reset_index() + ) # + dwas_grouped.rename( + columns={"dwa_title": "dwas"}, inplace=True + ) # Rename column to 'dwas' + print(f"Processed DWA data. Shape: {dwas_grouped.shape}") + else: + print("No valid DWA data found or provided for processing.") + dwas_grouped = None # Set to None if no DWAs + + # --- 5. Get Base Task/Occupation Info --- + base_cols = [ + "onetsoc_code", + "task_id", + "task", + "occupation_title", + "occupation_description", + ] + # Check if base columns exist in ratings_df + missing_base_cols = [col for col in base_cols if col not in ratings_df.columns] + if missing_base_cols: + print( + f"Error: Missing base info columns in ratings_df: {missing_base_cols}. Cannot proceed." 
+ ) + return None + if not ratings_df.empty: + base_info = ( + ratings_df[base_cols] + .drop_duplicates() + .set_index(["onetsoc_code", "task_id"]) + ) + print(f"Extracted base info. Shape: {base_info.shape}") + else: + print("Cannot extract base info from empty ratings DataFrame.") + # Create an empty df with index to avoid errors later if possible + idx = pd.MultiIndex( + levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"] + ) + base_info = pd.DataFrame( + index=idx, + columns=[ + col for col in base_cols if col not in ["onetsoc_code", "task_id"] + ], + ) + + # --- 6. Merge Processed Data --- + print("Merging processed data...") + # Start with base_info, which should have the index ['onetsoc_code', 'task_id'] + final_df = base_info.merge( + freq_pivot, left_index=True, right_index=True, how="left" + ) + # Reset index before merging non-indexed dfs + final_df = final_df.reset_index() + + # Merge averages - check if they are not empty before merging + if not imp_avg.empty: + final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left") + else: + final_df["importance_average"] = np.nan # Add column if imp_avg was empty + + if not rel_avg.empty: + final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left") + else: + final_df["relevance_average"] = np.nan # Add column if rel_avg was empty + + # Merge DWAs if available + if dwas_grouped is not None and not dwas_grouped.empty: + final_df = final_df.merge( + dwas_grouped, on=["onetsoc_code", "task_id"], how="left" + ) # Merge the dwas list + # Fill NaN in 'dwas' column (for tasks with no DWAs) with empty lists + # Check if 'dwas' column exists before applying function + if "dwas" in final_df.columns: + final_df["dwas"] = final_df["dwas"].apply( + lambda x: x if isinstance(x, list) else [] + ) # Ensure tasks without DWAs get [] + else: + print("Warning: 'dwas' column not created during merge.") + final_df["dwas"] = [ + [] for _ in range(len(final_df)) + ] # Add empty list column + + else: + # Add an empty 'dwas' column if no DWA data was processed or merged + final_df["dwas"] = [[] for _ in range(len(final_df))] + + print(f"Final merged data shape: {final_df.shape}") + + # Convert DataFrame to list of dictionaries for JSON output + # Handle potential NaN values during JSON conversion + # Replace numpy NaN with Python None for JSON compatibility + final_df = final_df.replace({np.nan: None}) + result_list = final_df.to_dict(orient="records") + + return result_list + + +# --- Output --- + + +def write_to_json(data, output_path): + """ + Writes the processed data to a JSON file. + + Args: + data (list): The list of dictionaries to write. + output_path (str): Path to the output JSON file. + """ + if data is None: + print("No data to write to JSON.") + return + if not isinstance(data, list): + print( + f"Error: Data to write is not a list (type: {type(data)}). Cannot write to JSON." 
+ ) + return + + # Create directory if it doesn't exist + output_dir = os.path.dirname(output_path) + if output_dir and not os.path.exists(output_dir): + try: + os.makedirs(output_dir) + print(f"Created output directory: {output_dir}") + except OSError as e: + print(f"Error creating output directory {output_dir}: {e}") + return # Exit if cannot create directory + + try: + with open(output_path, "w", encoding="utf-8") as f: + json.dump(data, f, indent=4, ensure_ascii=False) + print(f"Successfully wrote enriched data to {output_path}") + except IOError as e: + print(f"Error writing JSON file to {output_path}: {e}") + except TypeError as e: + print(f"Error during JSON serialization: {e}. Check data types.") + except Exception as e: + print(f"An unexpected error occurred during JSON writing: {e}") + + +# --- Main Execution --- + +if __name__ == "__main__": + print("Starting O*NET Task Ratings & DWAs Enrichment Script...") + # 1. Fetch data + ratings_data_df, dwas_data_df = fetch_data_from_db(DB_FILE) # Fetch both datasets + + # 2. Process data + # Proceed only if ratings_data_df is a valid DataFrame (even if empty) + # dwas_data_df can be None or empty, handled inside process function + if isinstance(ratings_data_df, pd.DataFrame): + enriched_data = process_task_ratings_with_dwas( + ratings_data_df, dwas_data_df + ) # Pass both dataframes + + # 3. Write output + if ( + enriched_data is not None + ): # Check if processing returned data (even an empty list is valid) + write_to_json(enriched_data, OUTPUT_FILE) + else: + print("Data processing failed or returned None. No output file generated.") + else: + print( + "Data fetching failed or returned invalid type for ratings data. Script terminated." + ) + + print("Script finished.") diff --git a/pipeline/enrichments.py b/pipeline/enrichments.py index 85c00c6..1ef8fdf 100644 --- a/pipeline/enrichments.py +++ b/pipeline/enrichments.py @@ -7,6 +7,7 @@ from .run import Run import pandas as pd def enrich_with_task_estimateability(run: Run) -> pd.DataFrame: + run.metadata. raise NotImplementedError def enrich_with_task_estimates(run: Run) -> pd.DataFrame: diff --git a/pipeline/fetchers.py b/pipeline/fetchers.py index e5e5158..3b98451 100644 --- a/pipeline/fetchers.py +++ b/pipeline/fetchers.py @@ -5,13 +5,148 @@ Fetchers retrieve remote data and return it in a format suitable for further pro import sqlite3 from typing import Tuple import pandas as pd -from .metadata import Metadata +import requests +import hashlib +import io +import zipfile +from .run import Run +from .logger import logger -def fetch_onet_database(meta: Metadata) -> Tuple[sqlite3.Connection, str]: - raise NotImplementedError +def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]: + """ + Downloads the O*NET database, creates a local SQLite file from it, and returns a connection. + The version is the sha256 of the downloaded zip file. 
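+
+    A minimal usage sketch (assumes a Run instance with a valid cache_dir):
+        conn, version = fetch_onet_database(run)
+        counts = pd.read_sql_query("SELECT COUNT(*) AS n FROM task_statements;", conn)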
+ """ + url = "https://www.onetcenter.org/dl_files/database/db_29_1_mysql.zip" + logger.info(f"Downloading O*NET database from {url}") + response = requests.get(url, stream=True) + response.raise_for_status() -def fetch_oesm_data(meta: Metadata) -> Tuple[pd.DataFrame, str]: - raise NotImplementedError + # Read content into memory + zip_content = response.content + version = hashlib.sha256(zip_content).hexdigest() + logger.info(f"O*NET database version (sha256): {version}") -def fetch_epoch_remote_data(meta: Metadata) -> Tuple[pd.DataFrame, str]: - raise NotImplementedError + db_path = run.cache_dir / f"onet_{version}.db" + + if db_path.exists(): + logger.info(f"Using cached O*NET database: {db_path}") + conn = sqlite3.connect(db_path) + # Set PRAGMA for foreign keys on every connection + conn.execute("PRAGMA foreign_keys = ON;") + return conn, version + + logger.info(f"Creating new O*NET database: {db_path}") + conn = sqlite3.connect(db_path) + + # Set performance PRAGMAs for fast import + logger.info("Creating new SQLite database with performance settings") + conn.executescript(""" + PRAGMA journal_mode = OFF; + PRAGMA synchronous = 0; + PRAGMA cache_size = 1000000; + PRAGMA locking_mode = EXCLUSIVE; + PRAGMA temp_store = MEMORY; + PRAGMA foreign_keys = ON; + """) + + with zipfile.ZipFile(io.BytesIO(zip_content)) as z: + sql_scripts = [] + for filename in sorted(z.namelist()): + if filename.endswith(".sql"): + sql_scripts.append(z.read(filename).decode('utf-8')) + + if not sql_scripts: + raise RuntimeError("No SQL files found in the O*NET zip archive.") + + # Combine and execute all SQL files in one transaction + full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;" + + logger.info("Executing SQL files in alphabetical order (single transaction mode)") + conn.executescript(full_script) + logger.info("Database populated successfully. Restoring reliability settings...") + + # Restore reliability-focused settings after import + conn.executescript(""" + PRAGMA journal_mode = WAL; + PRAGMA synchronous = NORMAL; + PRAGMA locking_mode = NORMAL; + PRAGMA temp_store = DEFAULT; + PRAGMA foreign_keys = ON; + PRAGMA optimize; + """) + conn.execute("VACUUM;") + conn.commit() + logger.info("Reliability settings restored and database optimized successfully!") + + return conn, version + +def fetch_oesm_data(run: Run) -> Tuple[pd.DataFrame, str]: + """ + Downloads the OESM national data from the BLS website. + The version is the sha256 of the downloaded zip file. 
+ """ + url = "https://www.bls.gov/oes/special-requests/oesm23nat.zip" + logger.info(f"Downloading OESM data from {url}") + response = requests.get(url) + response.raise_for_status() + + zip_content = response.content + version = hashlib.sha256(zip_content).hexdigest() + logger.info(f"OESM data version (sha256): {version}") + + parquet_path = run.cache_dir / f"oesm_{version}.parquet" + if parquet_path.exists(): + logger.info(f"Using cached OESM data: {parquet_path}") + return pd.read_parquet(parquet_path), version + + logger.info(f"Creating new OESM data cache: {parquet_path}") + with zipfile.ZipFile(io.BytesIO(zip_content)) as z: + # Find the excel file in the zip + excel_filename = None + for filename in z.namelist(): + logger.debug(f"Found file in OESM zip: {filename}") + if filename.lower().endswith(".xlsx"): + excel_filename = filename + break + + if excel_filename is None: + raise FileNotFoundError("Could not find the Excel file in the OESM zip archive.") + + logger.info(f"Reading {excel_filename} from zip archive.") + with z.open(excel_filename) as f: + df = pd.read_excel(f, engine='openpyxl') + + df.to_parquet(parquet_path) + logger.info(f"Saved OESM data to cache: {parquet_path}") + return df, version + +def fetch_epoch_remote_data(run: Run) -> Tuple[pd.DataFrame, str]: + """ + Downloads the EPOCH AI remote work task data. + The version is the sha256 of the downloaded CSV file. + """ + # This is the direct download link constructed from the Google Drive share link + url = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r" + logger.info(f"Downloading EPOCH remote data from Google Drive: {url}") + + # Need to handle potential cookies/redirects from Google Drive + session = requests.Session() + response = session.get(url, stream=True) + response.raise_for_status() + + csv_content = response.content + version = hashlib.sha256(csv_content).hexdigest() + logger.info(f"EPOCH remote data version (sha256): {version}") + + parquet_path = run.cache_dir / f"epoch_remote_{version}.parquet" + if parquet_path.exists(): + logger.info(f"Using cached EPOCH remote data: {parquet_path}") + return pd.read_parquet(parquet_path), version + + logger.info(f"Creating new EPOCH remote data cache: {parquet_path}") + df = pd.read_csv(io.BytesIO(csv_content)) + df.to_parquet(parquet_path) + logger.info(f"Saved EPOCH remote data to cache: {parquet_path}") + + return df, version diff --git a/pipeline/generators/estimate_histplot.py b/pipeline/generators/estimate_histplot.py index 139f99f..4725573 100644 --- a/pipeline/generators/estimate_histplot.py +++ b/pipeline/generators/estimate_histplot.py @@ -2,5 +2,5 @@ from ..run import Run from pathlib import Path from typing import Generator -def generate_estimate_histplot(run: Run, output_dir: Path) -> Generator[Path]: +def generate_estimate_histplot(run: Run) -> Generator[Path]: raise NotImplementedError diff --git a/pipeline/logger.py b/pipeline/logger.py new file mode 100644 index 0000000..cc80532 --- /dev/null +++ b/pipeline/logger.py @@ -0,0 +1,24 @@ +import logging +from logging.handlers import RotatingFileHandler +from rich.logging import RichHandler + +LOGGER_NAME = "pipeline" + +def setup_logging() -> logging.Logger: + # Set up Rich console handler + rich_handler = RichHandler( + level=logging.DEBUG, + show_time=True, + enable_link_path=True, + rich_tracebacks=True, + # omit_repeated_times=False, + ) + + logger = logging.getLogger(LOGGER_NAME) + logger.setLevel(logging.DEBUG) + logger.addHandler(rich_handler) + + return logger 
+ + +logger = setup_logging() diff --git a/pipeline/metadata.py b/pipeline/metadata.py index df84459..335644b 100644 --- a/pipeline/metadata.py +++ b/pipeline/metadata.py @@ -16,6 +16,7 @@ class Metadata(BaseModel): versions, and other important information. """ fetchers: Dict[str, Dict[str, Any]] = Field(default_factory=dict) + enrichments: Dict[str, Dict[str, Any]] = Field(default_factory=dict) ts: str = Field(default_factory=lambda: datetime.now().strftime("%Y-%m-%d %H:%M:%S")) commit: str = Field(default_factory=lambda: _get_current_commit()) diff --git a/pipeline/run.py b/pipeline/run.py index 4c28c1b..c476f99 100644 --- a/pipeline/run.py +++ b/pipeline/run.py @@ -1,6 +1,7 @@ from pydantic import BaseModel, Field import sqlite3 import pandas as pd +from pathlib import Path from typing import Optional from .metadata import Metadata @@ -20,3 +21,6 @@ class Run(BaseModel): task_estimates_df: Optional[pd.DataFrame] = None meta: Metadata = Field(default_factory=Metadata) + + cache_dir: Path + output_dir: Path diff --git a/pipeline/runner.py b/pipeline/runner.py index 4217aaa..0228ebd 100644 --- a/pipeline/runner.py +++ b/pipeline/runner.py @@ -5,11 +5,14 @@ from .postprocessors import check_for_insanity, create_df_tasks from .generators import GENERATORS from .run import Run from .constants import GRAY +import platformdirs import seaborn as sns import matplotlib as mpl from pathlib import Path from typings import Optional +CACHE_DIR = platformdirs.user_cache_dir("econtai") + def run(output_dir: Optional[str] = None): if output_dir is None: output_dir = Path(".") @@ -17,12 +20,12 @@ def run(output_dir: Optional[str] = None): load_dotenv() _setup_graph_rendering() - current_run = Run() + current_run = Run(output_dir=output_dir, cache_dir=CACHE_DIR) # Fetchers (fetchers.py) - current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run.meta) - current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run.meta) - current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run.meta) + current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run) + current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run) + current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run) # Enrichments (enrichments.py) current_run.task_estimateability_df = enrich_with_task_estimateability(current_run) @@ -34,7 +37,7 @@ def run(output_dir: Optional[str] = None): # Generators (generators/) for gen in GENERATORS: - gen(current_run, output_dir) + gen(current_run) def _setup_graph_rendering(): diff --git a/pyproject.toml b/pyproject.toml index d2ee523..8682de7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,9 +6,12 @@ readme = "README.md" requires-python = ">=3.13" dependencies = [ "matplotlib>=3.10.3", + "openpyxl>=3.1.5", "pandas>=2.2.3", + "platformdirs>=4.3.8", "pydantic>=2.11.7", "python-dotenv>=1.1.1", + "requests>=2.32.4", "seaborn>=0.13.2", ] diff --git a/uv.lock b/uv.lock index 534ca90..d75b52d 100644 --- a/uv.lock +++ b/uv.lock @@ -11,6 +11,37 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload_time = "2024-05-20T21:33:24.1Z" }, ] +[[package]] +name = "certifi" +version = "2025.6.15" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/73/f7/f14b46d4bcd21092d7d3ccef689615220d8a08fb25e564b65d20738e672e/certifi-2025.6.15.tar.gz", hash = "sha256:d747aa5a8b9bbbb1bb8c22bb13e22bd1f18e9796defa16bab421f7f7a317323b", size = 158753, upload_time = "2025-06-15T02:45:51.329Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/ae/320161bd181fc06471eed047ecce67b693fd7515b16d495d8932db763426/certifi-2025.6.15-py3-none-any.whl", hash = "sha256:2e0c7ce7cb5d8f8634ca55d2ba7e6ec2689a2fd6537d8dec1296a477a4910057", size = 157650, upload_time = "2025-06-15T02:45:49.977Z" }, +] + +[[package]] +name = "charset-normalizer" +version = "3.4.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e4/33/89c2ced2b67d1c2a61c19c6751aa8902d46ce3dacb23600a283619f5a12d/charset_normalizer-3.4.2.tar.gz", hash = "sha256:5baececa9ecba31eff645232d59845c07aa030f0c81ee70184a90d35099a0e63", size = 126367, upload_time = "2025-05-02T08:34:42.01Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ea/12/a93df3366ed32db1d907d7593a94f1fe6293903e3e92967bebd6950ed12c/charset_normalizer-3.4.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:926ca93accd5d36ccdabd803392ddc3e03e6d4cd1cf17deff3b989ab8e9dbcf0", size = 199622, upload_time = "2025-05-02T08:32:56.363Z" }, + { url = "https://files.pythonhosted.org/packages/04/93/bf204e6f344c39d9937d3c13c8cd5bbfc266472e51fc8c07cb7f64fcd2de/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eba9904b0f38a143592d9fc0e19e2df0fa2e41c3c3745554761c5f6447eedabf", size = 143435, upload_time = "2025-05-02T08:32:58.551Z" }, + { url = "https://files.pythonhosted.org/packages/22/2a/ea8a2095b0bafa6c5b5a55ffdc2f924455233ee7b91c69b7edfcc9e02284/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3fddb7e2c84ac87ac3a947cb4e66d143ca5863ef48e4a5ecb83bd48619e4634e", size = 153653, upload_time = "2025-05-02T08:33:00.342Z" }, + { url = "https://files.pythonhosted.org/packages/b6/57/1b090ff183d13cef485dfbe272e2fe57622a76694061353c59da52c9a659/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98f862da73774290f251b9df8d11161b6cf25b599a66baf087c1ffe340e9bfd1", size = 146231, upload_time = "2025-05-02T08:33:02.081Z" }, + { url = "https://files.pythonhosted.org/packages/e2/28/ffc026b26f441fc67bd21ab7f03b313ab3fe46714a14b516f931abe1a2d8/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c9379d65defcab82d07b2a9dfbfc2e95bc8fe0ebb1b176a3190230a3ef0e07c", size = 148243, upload_time = "2025-05-02T08:33:04.063Z" }, + { url = "https://files.pythonhosted.org/packages/c0/0f/9abe9bd191629c33e69e47c6ef45ef99773320e9ad8e9cb08b8ab4a8d4cb/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e635b87f01ebc977342e2697d05b56632f5f879a4f15955dfe8cef2448b51691", size = 150442, upload_time = "2025-05-02T08:33:06.418Z" }, + { url = "https://files.pythonhosted.org/packages/67/7c/a123bbcedca91d5916c056407f89a7f5e8fdfce12ba825d7d6b9954a1a3c/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1c95a1e2902a8b722868587c0e1184ad5c55631de5afc0eb96bc4b0d738092c0", size = 145147, upload_time = "2025-05-02T08:33:08.183Z" }, + { url = 
"https://files.pythonhosted.org/packages/ec/fe/1ac556fa4899d967b83e9893788e86b6af4d83e4726511eaaad035e36595/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ef8de666d6179b009dce7bcb2ad4c4a779f113f12caf8dc77f0162c29d20490b", size = 153057, upload_time = "2025-05-02T08:33:09.986Z" }, + { url = "https://files.pythonhosted.org/packages/2b/ff/acfc0b0a70b19e3e54febdd5301a98b72fa07635e56f24f60502e954c461/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:32fc0341d72e0f73f80acb0a2c94216bd704f4f0bce10aedea38f30502b271ff", size = 156454, upload_time = "2025-05-02T08:33:11.814Z" }, + { url = "https://files.pythonhosted.org/packages/92/08/95b458ce9c740d0645feb0e96cea1f5ec946ea9c580a94adfe0b617f3573/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:289200a18fa698949d2b39c671c2cc7a24d44096784e76614899a7ccf2574b7b", size = 154174, upload_time = "2025-05-02T08:33:13.707Z" }, + { url = "https://files.pythonhosted.org/packages/78/be/8392efc43487ac051eee6c36d5fbd63032d78f7728cb37aebcc98191f1ff/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4a476b06fbcf359ad25d34a057b7219281286ae2477cc5ff5e3f70a246971148", size = 149166, upload_time = "2025-05-02T08:33:15.458Z" }, + { url = "https://files.pythonhosted.org/packages/44/96/392abd49b094d30b91d9fbda6a69519e95802250b777841cf3bda8fe136c/charset_normalizer-3.4.2-cp313-cp313-win32.whl", hash = "sha256:aaeeb6a479c7667fbe1099af9617c83aaca22182d6cf8c53966491a0f1b7ffb7", size = 98064, upload_time = "2025-05-02T08:33:17.06Z" }, + { url = "https://files.pythonhosted.org/packages/e9/b0/0200da600134e001d91851ddc797809e2fe0ea72de90e09bec5a2fbdaccb/charset_normalizer-3.4.2-cp313-cp313-win_amd64.whl", hash = "sha256:aa6af9e7d59f9c12b33ae4e9450619cf2488e2bbe9b44030905877f0b2324980", size = 105641, upload_time = "2025-05-02T08:33:18.753Z" }, + { url = "https://files.pythonhosted.org/packages/20/94/c5790835a017658cbfabd07f3bfb549140c3ac458cfc196323996b10095a/charset_normalizer-3.4.2-py3-none-any.whl", hash = "sha256:7f56930ab0abd1c45cd15be65cc741c28b1c9a34876ce8c17a2fa107810c0af0", size = 52626, upload_time = "2025-05-02T08:34:40.053Z" }, +] + [[package]] name = "contourpy" version = "1.3.2" @@ -51,6 +82,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload_time = "2023-10-07T05:32:16.783Z" }, ] +[[package]] +name = "et-xmlfile" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload_time = "2024-10-25T17:25:40.039Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload_time = "2024-10-25T17:25:39.051Z" }, +] + [[package]] name = "fonttools" version = "4.58.5" @@ -68,6 +108,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d7/d4/1d85a1996b6188cd2713230e002d79a6f3a289bb17cef600cba385848b72/fonttools-4.58.5-py3-none-any.whl", hash = 
"sha256:e48a487ed24d9b611c5c4b25db1e50e69e9854ca2670e39a3486ffcd98863ec4", size = 1115318, upload_time = "2025-07-03T14:04:45.378Z" }, ] +[[package]] +name = "idna" +version = "3.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload_time = "2024-09-15T18:07:39.745Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload_time = "2024-09-15T18:07:37.964Z" }, +] + [[package]] name = "kiwisolver" version = "1.4.8" @@ -163,6 +212,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/63/be/b85e4aa4bf42c6502851b971f1c326d583fcc68227385f92089cf50a7b45/numpy-2.2.5-cp313-cp313t-win_amd64.whl", hash = "sha256:d403c84991b5ad291d3809bace5e85f4bbf44a04bdc9a88ed2bb1807b3360bb8", size = 12750096, upload_time = "2025-04-19T22:47:00.147Z" }, ] +[[package]] +name = "openpyxl" +version = "3.1.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "et-xmlfile" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload_time = "2024-06-28T14:03:44.161Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload_time = "2024-06-28T14:03:41.161Z" }, +] + [[package]] name = "packaging" version = "25.0" @@ -254,6 +315,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/89/c7/5572fa4a3f45740eaab6ae86fcdf7195b55beac1371ac8c619d880cfe948/pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa", size = 2512835, upload_time = "2025-07-01T09:15:50.399Z" }, ] +[[package]] +name = "platformdirs" +version = "4.3.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fe/8b/3c73abc9c759ecd3f1f7ceff6685840859e8070c4d947c93fae71f6a0bf2/platformdirs-4.3.8.tar.gz", hash = "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc", size = 21362, upload_time = "2025-05-07T22:47:42.121Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl", hash = "sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4", size = 18567, upload_time = "2025-05-07T22:47:40.376Z" }, +] + [[package]] name = "pydantic" version = "2.11.7" @@ -336,6 +406,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload_time = "2025-03-25T02:24:58.468Z" }, ] +[[package]] +name = "requests" +version = "2.32.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { 
name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e1/0a/929373653770d8a0d7ea76c37de6e41f11eb07559b103b1c02cafb3f7cf8/requests-2.32.4.tar.gz", hash = "sha256:27d0316682c8a29834d3264820024b62a36942083d52caf2f14c0591336d3422", size = 135258, upload_time = "2025-06-09T16:43:07.34Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c", size = 64847, upload_time = "2025-06-09T16:43:05.728Z" }, +] + [[package]] name = "seaborn" version = "0.13.2" @@ -365,18 +450,24 @@ version = "0.1.0" source = { virtual = "." } dependencies = [ { name = "matplotlib" }, + { name = "openpyxl" }, { name = "pandas" }, + { name = "platformdirs" }, { name = "pydantic" }, { name = "python-dotenv" }, + { name = "requests" }, { name = "seaborn" }, ] [package.metadata] requires-dist = [ { name = "matplotlib", specifier = ">=3.10.3" }, + { name = "openpyxl", specifier = ">=3.1.5" }, { name = "pandas", specifier = ">=2.2.3" }, + { name = "platformdirs", specifier = ">=4.3.8" }, { name = "pydantic", specifier = ">=2.11.7" }, { name = "python-dotenv", specifier = ">=1.1.1" }, + { name = "requests", specifier = ">=2.32.4" }, { name = "seaborn", specifier = ">=0.13.2" }, ] @@ -412,3 +503,12 @@ sdist = { url = "https://files.pythonhosted.org/packages/95/32/1a225d6164441be76 wheels = [ { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload_time = "2025-03-23T13:54:41.845Z" }, ] + +[[package]] +name = "urllib3" +version = "2.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/15/22/9ee70a2574a4f4599c47dd506532914ce044817c7752a79b6a51286319bc/urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760", size = 393185, upload_time = "2025-06-18T14:07:41.644Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload_time = "2025-06-18T14:07:40.39Z" }, +]