old
This commit is contained in:
parent 720f21a85b
commit 43076bcbb1
42 changed files with 237415 additions and 7831 deletions
0  analysis/__init__.py  Normal file
207  analysis/data.py  Normal file
@@ -0,0 +1,207 @@
import logging
import re
import requests
import shutil
import sqlite3
import zipfile
from pathlib import Path

# Configure logging to provide feedback during the data setup process
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- Constants ---
# Using a data directory at the root of the project
DATA_DIR = Path("data")

# O*NET database details. We download the MySQL version and convert it to SQLite.
ONET_MYSQL_URL = "https://www.onetcenter.org/dl_files/database/db_29_3_mysql.zip"
DB_ZIP_PATH = DATA_DIR / "onet_mysql.zip"
DB_FILE_PATH = DATA_DIR / "onet.db"
EXTRACT_DIR = DATA_DIR / "onet_mysql_extracted"

# URLs for other required data files are in a separate text data archive.
ONET_TEXT_URL = "https://www.onetcenter.org/dl_files/database/db_29_3_text.zip"
TEXT_ZIP_PATH = DATA_DIR / "onet_text.zip"
TASK_RATINGS_PATH = DATA_DIR / "Task Ratings.txt"
DWA_REFERENCE_PATH = DATA_DIR / "DWA Reference.txt"


def setup_data_and_database():
    """
    Main function to orchestrate the data setup.
    It ensures the data directory exists, then downloads and sets up the O*NET
    database and any other required data files.
    """
    logging.info("Starting data and database setup...")
    DATA_DIR.mkdir(exist_ok=True)

    _setup_onet_database()
    _download_additional_data()

    logging.info("Data and database setup complete.")


def _setup_onet_database():
    """
    Downloads the O*NET MySQL database, extracts it, and imports it into a
    new SQLite database, following performance best practices from a shell script.
    This method performs minimal text-based conversion of the MySQL dump to
    make it compatible with SQLite before importing.
    """
    if DB_FILE_PATH.exists():
        logging.info("O*NET database already exists at %s. Skipping setup.", DB_FILE_PATH)
        return

    logging.info("O*NET database not found. Starting fresh setup.")
    # Ensure the extraction directory is clean before use
    if EXTRACT_DIR.exists():
        shutil.rmtree(EXTRACT_DIR)
    EXTRACT_DIR.mkdir()

    try:
        # 1. Download if necessary
        if not DB_ZIP_PATH.exists():
            logging.info("Downloading O*NET database from %s", ONET_MYSQL_URL)
            _download_file(ONET_MYSQL_URL, DB_ZIP_PATH)
        else:
            logging.info("Using existing O*NET zip file at %s", DB_ZIP_PATH)

        # 2. Extract
        logging.info("Extracting O*NET database files to %s", EXTRACT_DIR)
        with zipfile.ZipFile(DB_ZIP_PATH, 'r') as zip_ref:
            zip_ref.extractall(EXTRACT_DIR)

        # 3. Create new DB with performance PRAGMAs
        logging.info("Creating new SQLite database with performance settings: %s", DB_FILE_PATH)
        conn = sqlite3.connect(DB_FILE_PATH)
        conn.executescript("""
            PRAGMA journal_mode = OFF;
            PRAGMA synchronous = 0;
            PRAGMA cache_size = 1000000;
            PRAGMA locking_mode = EXCLUSIVE;
            PRAGMA temp_store = MEMORY;
        """)
        conn.close()

        # 4. Combine all SQL files, convert, and import in a single transaction
        logging.info("Combining and converting SQL files for single transaction import...")
        sql_files = sorted(EXTRACT_DIR.rglob('*.sql'))
        if not sql_files:
            raise FileNotFoundError(f"No SQL files found in {EXTRACT_DIR}")

        # Concatenate all files into one string
        mysql_dump = "\n".join([sql_file.read_text(encoding='utf-8') for sql_file in sql_files])

        # Minimal conversion for SQLite: remove backticks and ENGINE clauses
        sqlite_dump = mysql_dump.replace('`', '')
        sqlite_dump = re.sub(r'\) ENGINE=InnoDB.*?;', ');', sqlite_dump, flags=re.DOTALL)

        full_script = f"BEGIN TRANSACTION;\n{sqlite_dump}\nCOMMIT;"

        logging.info(f"Importing {len(sql_files)} SQL files into database...")
        conn = sqlite3.connect(DB_FILE_PATH)
        conn.executescript(full_script)
        conn.close()
        logging.info("Database populated successfully.")

        # 5. Restore reliability settings and optimize
        logging.info("Restoring reliability settings and optimizing database...")
        conn = sqlite3.connect(DB_FILE_PATH)
        conn.executescript("""
            PRAGMA journal_mode = WAL;
            PRAGMA synchronous = NORMAL;
            PRAGMA locking_mode = NORMAL;
            PRAGMA temp_store = DEFAULT;
            PRAGMA foreign_keys = ON;
            PRAGMA optimize;
        """)
        conn.execute("VACUUM;")
        conn.close()
        logging.info("Database setup and optimization complete.")

    except Exception as e:
        logging.error("Failed during database setup: %s", e, exc_info=True)
        if DB_FILE_PATH.exists():
            DB_FILE_PATH.unlink()
        raise
    finally:
        # 6. Cleanup
        logging.info("Cleaning up temporary files...")
        if DB_ZIP_PATH.exists():
            DB_ZIP_PATH.unlink()
        if EXTRACT_DIR.exists():
            shutil.rmtree(EXTRACT_DIR)


def _download_additional_data():
    """
    Downloads and extracts supplementary data files from the O*NET text archive.
    If the required text files already exist, this function does nothing.
    """
    required_files = [TASK_RATINGS_PATH, DWA_REFERENCE_PATH]
    if all(p.exists() for p in required_files):
        logging.info("All required text data files already exist. Skipping download.")
        return

    logging.info("One or more text data files are missing. Downloading and extracting from archive...")
    try:
        _download_file(ONET_TEXT_URL, TEXT_ZIP_PATH)
        logging.info("Unzipping text data archive...")
        with zipfile.ZipFile(TEXT_ZIP_PATH, 'r') as zip_ref:
            # Extract only the files we need, without creating subdirectories
            for target_path in required_files:
                if not target_path.exists():
                    # Find the corresponding file within the zip archive's directory structure
                    member_name = next((m for m in zip_ref.namelist() if m.endswith(target_path.name)), None)
                    if member_name:
                        with zip_ref.open(member_name) as source, open(target_path, 'wb') as target:
                            target.write(source.read())
                        logging.info("Extracted %s", target_path.name)
                    else:
                        logging.warning("Could not find %s in the text data archive.", target_path.name)

    except requests.exceptions.RequestException as e:
        logging.error("Failed to download O*NET text data archive: %s", e)
        raise
    except zipfile.BadZipFile as e:
        logging.error("Failed to process the text data archive: %s", e)
        raise
    finally:
        # Clean up the downloaded zip file
        if TEXT_ZIP_PATH.exists():
            TEXT_ZIP_PATH.unlink()
            logging.info("Cleaned up downloaded text archive zip file.")


def _download_file(url, destination):
    """
    Helper function to download a file from a URL, with streaming for large files.
    """
    logging.info("Downloading from %s to %s", url, destination)
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(destination, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    logging.info("Download of %s complete.", destination.name)


def get_db_connection():
    """
    Establishes and returns a connection to the SQLite database.
    Returns None if the database file does not exist.
    """
    if not DB_FILE_PATH.exists():
        logging.error("Database file not found at %s. Run the setup process first.", DB_FILE_PATH)
        return None
    try:
        conn = sqlite3.connect(DB_FILE_PATH)
        return conn
    except sqlite3.Error as e:
        logging.error("Failed to connect to the database: %s", e)
        return None


if __name__ == '__main__':
    # This allows the data setup to be run directly from the command line,
    # which is useful for initialization or debugging.
    setup_data_and_database()
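For reference, a minimal sketch (not part of this commit) of what the two text substitutions in _setup_onet_database do to a typical statement from the MySQL dump; the table and column names here are made up for illustration:

import re

mysql_stmt = (
    "CREATE TABLE `example_table` (\n"
    "  `onetsoc_code` CHAR(10) NOT NULL\n"
    ") ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;"
)

# Same conversion as in _setup_onet_database: strip backticks, drop the ENGINE clause.
sqlite_stmt = mysql_stmt.replace('`', '')
sqlite_stmt = re.sub(r'\) ENGINE=InnoDB.*?;', ');', sqlite_stmt, flags=re.DOTALL)
print(sqlite_stmt)
# CREATE TABLE example_table (
#   onetsoc_code CHAR(10) NOT NULL
# );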
76  analysis/generate.py  Normal file
@@ -0,0 +1,76 @@
import importlib
import logging
import pkgutil
import shutil
from pathlib import Path

# The final destination for all generated outputs
DIST_DIR = Path("dist")

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def create_all_outputs(processed_df):
    """
    Dynamically discovers, imports, and runs all output generators.

    This function iterates through all modules in the 'analysis.generators'
    package. For each module, it assumes there is a 'generate(data)' function,
    which it calls with the provided preprocessed DataFrame.

    The generator function is expected to save its output to a temporary file
    and return the path to that file. This function then moves the output
    to the 'dist/' directory.

    Args:
        processed_df (pd.DataFrame): The fully preprocessed data to be used
                                     by the generator functions.
    """
    logging.info("Starting output generation...")
    DIST_DIR.mkdir(exist_ok=True)
    logging.info(f"Output directory is '{DIST_DIR.resolve()}'")

    # Path to the generators package
    from . import generators as generators_package
    generators_path = generators_package.__path__
    generators_prefix = generators_package.__name__ + "."

    generated_files_count = 0

    # Discover and run all modules in the generators package
    for _, module_name, _ in pkgutil.iter_modules(generators_path, prefix=generators_prefix):
        try:
            logging.info(f"--- Running generator: {module_name} ---")

            # Import the generator module
            generator_module = importlib.import_module(module_name)

            # Check if the module has the required 'generate' function
            if not hasattr(generator_module, 'generate'):
                logging.warning(f"Generator module {module_name} does not have a 'generate' function. Skipping.")
                continue

            # Call the generator function, passing in the preprocessed data
            generator_func = getattr(generator_module, 'generate')
            temp_output_path = generator_func(processed_df)

            # If the generator returned a path, move the file to the dist directory
            if temp_output_path and isinstance(temp_output_path, Path) and temp_output_path.exists():
                # Sanitize the module name to create a valid filename
                base_filename = module_name.split('.')[-1]
                # Keep the original extension from the temp file
                final_filename = base_filename + temp_output_path.suffix
                final_output_path = DIST_DIR / final_filename

                shutil.move(temp_output_path, final_output_path)
                logging.info(f"Successfully generated '{final_output_path.name}'")
                generated_files_count += 1
            else:
                logging.warning(f"Generator {module_name} did not return a valid output file path. Nothing was saved.")

        except Exception as e:
            logging.error(f"Failed to run generator {module_name}. Error: {e}", exc_info=True)
            # Continue to the next generator

    logging.info(f"--- Output generation complete. Total files generated: {generated_files_count} ---")
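As a reading aid (not part of the commit), this is the contract create_all_outputs assumes of each module in analysis.generators: a top-level generate(processed_df) function that writes its output to a temporary file and returns that Path, or None on failure. A hypothetical generator could be as small as:

import logging
import tempfile
from pathlib import Path

import pandas as pd


def generate(processed_df: pd.DataFrame):
    """Hypothetical example generator: dumps the first rows of the data to CSV."""
    try:
        temp_path = Path(tempfile.gettempdir()) / "example_generator.csv"
        processed_df.head(20).to_csv(temp_path, index=False)
        return temp_path
    except Exception as e:
        logging.error("Example generator failed: %s", e)
        return None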
0  analysis/generators/__init__.py  Normal file
119  analysis/generators/estimate_lower_vs_upper_bounds.py  Normal file
@@ -0,0 +1,119 @@
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import tempfile
import logging
import pandas as pd
import numpy as np

# Copied from other generators for modularity. This dictionary maps
# O*NET major occupation group codes to human-readable labels.
OCCUPATION_MAJOR_CODES = {
    '11': 'Management',
    '13': 'Business & Financial',
    '15': 'Computer & Mathematical',
    '17': 'Architecture & Engineering',
    '19': 'Life, Physical, & Social Science',
    '21': 'Community & Social Service',
    '23': 'Legal',
    '25': 'Education, Training, & Library',
    '27': 'Arts, Design, & Media',
    '29': 'Healthcare Practitioners',
    '31': 'Healthcare Support',
    '33': 'Protective Service',
    '35': 'Food Preparation & Serving',
    '37': 'Building & Grounds Maintenance',
    '39': 'Personal Care & Service',
    '41': 'Sales & Related',
    '43': 'Office & Admin Support',
    '45': 'Farming, Fishing, & Forestry',
    '47': 'Construction & Extraction',
    '49': 'Installation, Maintenance, & Repair',
    '51': 'Production',
    '53': 'Transportation & Material Moving',
    '55': 'Military Specific',
}


def generate(processed_df: pd.DataFrame):
    """
    Generates a scatter plot comparing lower vs. upper time estimates for tasks.

    This corresponds to 'cell3' from the original analysis notebook. It helps
    visualize the relationship and spread between the lower and upper bounds
    of time estimates across different occupation groups.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
                                     'lb_estimate_in_minutes',
                                     'ub_estimate_in_minutes', 'onetsoc_major'.

    Returns:
        Path: The path to the generated temporary image file, or None on failure.
    """
    logging.info("Generating plot of lower vs. upper time estimates...")

    # --- Data Validation and Preparation ---
    required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'onetsoc_major']
    if not all(col in processed_df.columns for col in required_cols):
        logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
        return None

    df = processed_df.copy()

    # For log scaling, both lower and upper bounds must be positive.
    df = df[(df['lb_estimate_in_minutes'] > 0) & (df['ub_estimate_in_minutes'] > 0)]
    if df.empty:
        logging.warning("No data with positive lower and upper estimates available to plot.")
        return None

    # Replace the major code with its readable label for the hue legend.
    df['occupation_label'] = df['onetsoc_major'].map(OCCUPATION_MAJOR_CODES)

    # --- Plotting ---
    try:
        plt.figure(figsize=(12, 10))
        ax = sns.scatterplot(
            data=df,
            x='lb_estimate_in_minutes',
            y='ub_estimate_in_minutes',
            alpha=0.2,
            edgecolor=None,
            hue="occupation_label"  # Use the labeled column for the legend
        )

        # Determine limits for the 45° reference line
        # Use the maximum of both columns to create a square plot
        max_val = df[['lb_estimate_in_minutes', 'ub_estimate_in_minutes']].max().max()
        lims = (df[['lb_estimate_in_minutes', 'ub_estimate_in_minutes']].min().min(), max_val)
        ax.plot(lims, lims, color='black', linestyle='--', linewidth=1, label='Upper = Lower')

        # Add helper lines for constant ratios (2x, 10x, 100x)
        for k in [2, 10, 100]:
            ax.plot(lims, [k * l for l in lims],
                    linestyle=':', color='grey', linewidth=0.8, label=f'Upper = {k}x Lower')

        ax.set(xscale='log', yscale='log', xlim=lims, ylim=lims)
        ax.set_xlabel('Lower-bound Estimate (minutes, log scale)', fontsize=12)
        ax.set_ylabel('Upper-bound Estimate (minutes, log scale)', fontsize=12)
        ax.set_title('Lower vs. Upper Time Estimates for All Tasks', fontsize=16)

        # Place the legend outside the plot to avoid obscuring data
        ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', title='Occupation / Ratio')

        # --- File Saving ---
        temp_dir = tempfile.gettempdir()
        temp_path = Path(temp_dir) / "estimate_lower_vs_upper_bounds.png"

        # Use bbox_inches='tight' to ensure the external legend is included in the saved image.
        plt.savefig(temp_path, dpi=300, bbox_inches='tight')
        logging.info(f"Successfully saved plot to temporary file: {temp_path}")

        return temp_path

    except Exception as e:
        logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
        return None
    finally:
        plt.close()
86  analysis/generators/estimate_ratio_distribution.py  Normal file
@@ -0,0 +1,86 @@
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import tempfile
import logging


def generate(processed_df: pd.DataFrame):
    """
    Generates a histogram of the log-ratio of upper to lower time estimates.

    This corresponds to 'cell4' from the original analysis notebook. It shows
    the distribution of how many times larger the upper estimate is compared
    to the lower estimate.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
                                     'lb_estimate_in_minutes',
                                     'ub_estimate_in_minutes'.

    Returns:
        Path: The path to the generated temporary image file, or None on failure.
    """
    logging.info("Generating distribution plot of estimate ratios...")

    # --- Data Validation and Preparation ---
    required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
    if not all(col in processed_df.columns for col in required_cols):
        logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
        return None

    df = processed_df.copy()

    # Calculate the ratio. We need to handle cases where the lower bound is zero.
    # Replace lower bound of 0 with a small number to avoid division by zero, or filter them out.
    # Here, we filter, as a ratio with a zero denominator is undefined.
    df = df[df['lb_estimate_in_minutes'] > 0]
    df['estimate_ratio'] = df['ub_estimate_in_minutes'] / df['lb_estimate_in_minutes']

    # Replace infinite values (which can occur if ub is huge and lb is tiny) with NaN
    # and drop rows with NaN or infinite ratios.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(subset=['estimate_ratio'], inplace=True)

    if df.empty:
        logging.warning("No valid data available to plot the estimate ratio distribution.")
        return None

    # --- Plotting ---
    try:
        plt.figure(figsize=(10, 6))

        # We plot the log10 of the ratio to better visualize the wide distribution
        log_ratio = np.log10(df['estimate_ratio'])

        sns.histplot(log_ratio, bins=60, kde=True)

        # Add vertical lines for reference points
        # log10(1) = 0, which is where upper bound equals lower bound
        plt.axvline(x=0, color='black', linestyle='-', linewidth=1.5, label='1x (Upper = Lower)')
        # A small ratio, e.g., 5% difference
        plt.axvline(x=np.log10(1.05), color='orange', linestyle='--', linewidth=1, label='1.05x ratio')
        # A 10x ratio
        plt.axvline(x=np.log10(10), color='red', linestyle='--', linewidth=1, label='10x ratio')

        plt.xlabel('log₁₀(Upper Estimate / Lower Estimate)', fontsize=12)
        plt.ylabel('Number of Tasks', fontsize=12)
        plt.title('Distribution of Time Estimate Ratios', fontsize=16)
        plt.legend()
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()

        # --- File Saving ---
        temp_dir = tempfile.gettempdir()
        temp_path = Path(temp_dir) / "estimate_ratio_distribution.png"
        plt.savefig(temp_path, dpi=300)
        logging.info(f"Successfully saved plot to temporary file: {temp_path}")

        return temp_path

    except Exception as e:
        logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
        return None
    finally:
        plt.close()
@@ -0,0 +1,135 @@
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
import tempfile
import logging

# This mapping helps translate the O*NET 2-digit major group codes
# into human-readable labels for the plot's y-axis.
OCCUPATION_MAJOR_CODES = {
    '11': 'Management',
    '13': 'Business & Financial',
    '15': 'Computer & Mathematical',
    '17': 'Architecture & Engineering',
    '19': 'Life, Physical, & Social Science',
    '21': 'Community & Social Service',
    '23': 'Legal',
    '25': 'Education, Training, & Library',
    '27': 'Arts, Design, & Media',
    '29': 'Healthcare Practitioners',
    '31': 'Healthcare Support',
    '33': 'Protective Service',
    '35': 'Food Preparation & Serving',
    '37': 'Building & Grounds Maintenance',
    '39': 'Personal Care & Service',
    '41': 'Sales & Related',
    '43': 'Office & Admin Support',
    '45': 'Farming, Fishing, & Forestry',
    '47': 'Construction & Extraction',
    '49': 'Installation, Maintenance, & Repair',
    '51': 'Production',
    '53': 'Transportation & Material Moving',
    '55': 'Military Specific',
}


def generate(processed_df: pd.DataFrame):
    """
    Generates a heatmap of the median estimate ratio by occupation and task length quartile.

    This corresponds to 'cell5' from the original analysis notebook. It shows
    how the ratio between upper and lower time estimates varies across
    different occupations and for tasks of different typical lengths (binned
    into quartiles).

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
                                     'lb_estimate_in_minutes',
                                     'ub_estimate_in_minutes', 'onetsoc_major'.

    Returns:
        Path: The path to the generated temporary image file, or None on failure.
    """
    logging.info("Generating heatmap of estimate ratios by occupation and task length...")

    # --- Data Validation and Preparation ---
    required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'onetsoc_major']
    if not all(col in processed_df.columns for col in required_cols):
        logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
        return None

    df = processed_df.copy()

    # Calculate the estimate ratio, handling division by zero and infinity
    df = df[df['lb_estimate_in_minutes'] > 0]
    df['estimate_ratio'] = df['ub_estimate_in_minutes'] / df['lb_estimate_in_minutes']
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(subset=['estimate_ratio'], inplace=True)

    if df.empty:
        logging.warning("No valid data available for the ratio heatmap.")
        return None

    # 1. Bin lower bounds into quartiles (Q1–Q4)
    # Using duplicates='drop' can help if there are many identical values
    # which can make binning into quantiles fail.
    try:
        df['lb_q'] = pd.qcut(
            df.lb_estimate_in_minutes,
            q=4,
            labels=['Q1 (Shortest)', 'Q2', 'Q3', 'Q4 (Longest)'],
            duplicates='drop'
        )
    except ValueError as e:
        logging.error(f"Could not bin data into quartiles: {e}. There might not be enough unique values.")
        return None

    # 2. Aggregate: median ratio per cell (occupation x task length quartile)
    pivot = df.pivot_table(
        index='onetsoc_major',
        columns='lb_q',
        values='estimate_ratio',
        aggfunc='median'
    )

    # Map the index (onetsoc_major codes) to their corresponding readable labels
    pivot.index = pivot.index.map(OCCUPATION_MAJOR_CODES)
    pivot.dropna(inplace=True)  # Drop occupations with no data in some quartiles for a cleaner plot

    if pivot.empty:
        logging.warning("Pivot table is empty after processing. Cannot generate heatmap.")
        return None

    # --- Plotting ---
    try:
        plt.figure(figsize=(12, 10))
        sns.heatmap(
            pivot,
            cmap='RdYlGn_r',  # Red-Yellow-Green (reversed), good for ratios centered around 1
            center=2,  # Center the colormap around a ratio of 2
            annot=True,  # Show the median values in the cells
            fmt='.1f',  # Format annotations to one decimal place
            linewidths=.5,
            cbar_kws={'label': 'Median Upper/Lower Estimate Ratio'}
        )
        plt.xlabel('Task Length (based on lower-bound quartile)', fontsize=12)
        plt.ylabel('Occupation Major Group', fontsize=12)
        plt.title('Typical Estimate Range Width by Occupation and Task Length', fontsize=16)
        plt.tight_layout()

        # --- File Saving ---
        temp_dir = tempfile.gettempdir()
        temp_path = Path(temp_dir) / "ratio_heatmap_by_occupation_and_task_length.png"
        plt.savefig(temp_path, dpi=300)
        logging.info(f"Successfully saved plot to temporary file: {temp_path}")

        return temp_path

    except Exception as e:
        logging.error(f"An error occurred while generating the heatmap: {e}", exc_info=True)
        return None
    finally:
        plt.close()
161  analysis/generators/task_breakdown_by_occupation.py  Normal file
@@ -0,0 +1,161 @@
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.colors as mcolors
from pathlib import Path
import tempfile
import logging

# This mapping helps translate the O*NET 2-digit major group codes
# into human-readable labels for the plot's y-axis.
OCCUPATION_MAJOR_CODES = {
    '11': 'Management',
    '13': 'Business & Financial',
    '15': 'Computer & Mathematical',
    '17': 'Architecture & Engineering',
    '19': 'Life, Physical, & Social Science',
    '21': 'Community & Social Service',
    '23': 'Legal',
    '25': 'Education, Training, & Library',
    '27': 'Arts, Design, & Media',
    '29': 'Healthcare Practitioners',
    '31': 'Healthcare Support',
    '33': 'Protective Service',
    '35': 'Food Preparation & Serving',
    '37': 'Building & Grounds Maintenance',
    '39': 'Personal Care & Service',
    '41': 'Sales & Related',
    '43': 'Office & Admin Support',
    '45': 'Farming, Fishing, & Forestry',
    '47': 'Construction & Extraction',
    '49': 'Installation, Maintenance, & Repair',
    '51': 'Production',
    '53': 'Transportation & Material Moving',
    '55': 'Military Specific',
}

# Define colors to match the original notebook's palette.
# These are standard hex codes for gray and lime shades.
BAR_COLORS = [
    '#D1D5DB',  # gray-300
    '#84CC16',  # lime-500
    '#D9F99D',  # lime-200
]


def _get_contrasting_text_color(bg_color_hex):
    """
    Determines if black or white text provides better contrast against a given background color.
    """
    try:
        rgba = mcolors.to_rgba(bg_color_hex)
        # Calculate luminance (Y) using the sRGB formula
        luminance = 0.2126 * rgba[0] + 0.7152 * rgba[1] + 0.0722 * rgba[2]
        return 'black' if luminance > 0.55 else 'white'
    except ValueError:
        return 'black'  # Default to black if color is invalid


def generate(processed_df: pd.DataFrame):
    """
    Generates a stacked bar chart breaking down tasks by remote status and estimability.

    This corresponds to 'cell10' from the original analysis notebook. It shows,
    for each occupation, the percentage of tasks that are not remote, remote and
    estimable, or remote and not estimable.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
                                     'onetsoc_major', 'remote_status', 'estimateable'.

    Returns:
        Path: The path to the generated temporary image file, or None on failure.
    """
    logging.info("Generating task breakdown by occupation plot...")

    # --- Data Validation ---
    required_cols = ['onetsoc_major', 'remote_status', 'estimateable']
    if not all(col in processed_df.columns for col in required_cols):
        logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
        return None

    df = processed_df.copy()

    # --- Data Summarization ---
    summary_data = []
    for code, label in OCCUPATION_MAJOR_CODES.items():
        occ_df = df[df['onetsoc_major'] == code]
        total_tasks = len(occ_df)
        if total_tasks == 0:
            continue

        not_remote_count = len(occ_df[occ_df['remote_status'] != 'remote'])
        remote_df = occ_df[occ_df['remote_status'] == 'remote']
        remote_atomic_count = len(remote_df[remote_df['estimateable'] == 'ATOMIC'])
        remote_ongoing_count = len(remote_df[remote_df['estimateable'] == 'ONGOING-CONSTRAINT'])

        summary_data.append({
            'occupation_label': label,
            'count_not_remote': not_remote_count,
            'count_remote_atomic': remote_atomic_count,
            'count_remote_ongoing': remote_ongoing_count,
            'total_tasks': total_tasks
        })

    if not summary_data:
        logging.warning("No data available to generate the task breakdown plot.")
        return None

    summary_df = pd.DataFrame(summary_data)

    # --- Percentage Calculation ---
    summary_df['pct_not_remote'] = (summary_df['count_not_remote'] / summary_df['total_tasks']) * 100
    summary_df['pct_remote_atomic'] = (summary_df['count_remote_atomic'] / summary_df['total_tasks']) * 100
    summary_df['pct_remote_ongoing'] = (summary_df['count_remote_ongoing'] / summary_df['total_tasks']) * 100

    plot_df = summary_df.set_index('occupation_label')[
        ['pct_not_remote', 'pct_remote_atomic', 'pct_remote_ongoing']
    ]
    plot_df.columns = ['Not Remote', 'Remote & Estimable', 'Remote & Not Estimable']
    plot_df = plot_df.sort_values(by='Not Remote', ascending=False)

    # --- Plotting ---
    try:
        fig, ax = plt.subplots(figsize=(14, 10))
        plot_df.plot(kind='barh', stacked=True, ax=ax, color=BAR_COLORS, width=0.8)

        ax.set_xlabel("Percentage of Tasks", fontsize=12)
        ax.set_ylabel("Occupation Major Group", fontsize=12)
        ax.set_title("Task Breakdown by Occupation, Remote Status, and Estimability", fontsize=16, pad=20)
        ax.xaxis.set_major_formatter(mtick.PercentFormatter())
        ax.set_xlim(0, 100)
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)

        # Add percentage labels inside each bar segment
        for i, container in enumerate(ax.containers):
            text_color = _get_contrasting_text_color(BAR_COLORS[i])
            for patch in container.patches:
                width = patch.get_width()
                if width > 3:  # Only label segments wider than 3%
                    x = patch.get_x() + width / 2
                    y = patch.get_y() + patch.get_height() / 2
                    ax.text(x, y, f"{width:.1f}%", ha='center', va='center',
                            fontsize=8, color=text_color, fontweight='medium')

        ax.legend(title="Task Category", bbox_to_anchor=(1.02, 1), loc='upper left', frameon=False)

        # --- File Saving ---
        temp_dir = tempfile.gettempdir()
        temp_path = Path(temp_dir) / "task_breakdown_by_occupation.png"
        plt.savefig(temp_path, dpi=300, bbox_inches='tight')
        logging.info(f"Successfully saved plot to temporary file: {temp_path}")

        return temp_path

    except Exception as e:
        logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
        return None
    finally:
        plt.close()
74  analysis/generators/task_estimate_distribution.py  Normal file
@@ -0,0 +1,74 @@
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import tempfile
import logging
import pandas as pd


def generate(processed_df: pd.DataFrame):
    """
    Generates a histogram of the task time estimate midpoints.

    This generator corresponds to 'cell1' from the original analysis notebook.
    It visualizes the distribution of the calculated midpoint of time estimates
    for all tasks on a logarithmic scale to handle the wide range of values.

    Args:
        processed_df (pd.DataFrame): The preprocessed data, expected to contain
                                     'lb_estimate_in_minutes' and
                                     'ub_estimate_in_minutes' columns.

    Returns:
        Path: The path to the generated temporary image file, or None if
              generation fails.
    """
    logging.info("Generating task estimate distribution plot...")

    # --- Data Validation and Preparation ---
    required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
    if not all(col in processed_df.columns for col in required_cols):
        logging.error(
            f"Required columns {required_cols} not found in the DataFrame. "
            "Cannot generate plot."
        )
        return None

    # Create a copy to avoid modifying the original DataFrame
    df = processed_df.copy()

    # Calculate the midpoint from lower and upper bounds, as was done in the notebook
    df['estimate_midpoint'] = (df['lb_estimate_in_minutes'] + df['ub_estimate_in_minutes']) / 2

    # For log scaling, we must use positive values. Filter out any non-positive midpoints.
    df = df[df['estimate_midpoint'] > 0]
    if df.empty:
        logging.warning("No data with positive estimate midpoints available to plot.")
        return None

    # --- Plotting ---
    try:
        plt.figure(figsize=(10, 6))
        ax = sns.histplot(data=df, x='estimate_midpoint', log_scale=True)

        ax.set_title('Distribution of Task Time Estimate Midpoints', fontsize=16)
        ax.set_xlabel('Estimate Midpoint (minutes, log scale)', fontsize=12)
        ax.set_ylabel('Number of Tasks', fontsize=12)
        plt.tight_layout()

        # --- File Saving ---
        # Create a temporary file to save the plot. The orchestrator (`generate.py`)
        # will move this to the final 'dist/' directory.
        temp_dir = tempfile.gettempdir()
        temp_path = Path(temp_dir) / "task_estimate_distribution.png"

        plt.savefig(temp_path, dpi=300)
        logging.info(f"Successfully saved plot to temporary file: {temp_path}")

        return temp_path

    except Exception as e:
        logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
        return None
    finally:
        # Close the figure to free up memory, which is crucial when running many generators.
        plt.close()
134  analysis/generators/temporal_coherence_cdf.py  Normal file
@@ -0,0 +1,134 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from pathlib import Path
import tempfile
import logging

# Replicating the color palette from the original notebook for consistency.
# These appear to be inspired by Tailwind CSS colors.
GRAY_PALETTE = {
    '100': '#F3F4F6',
    '300': '#D1D5DB',
}
LIME_PALETTE = {
    '300': '#D9F99D',
    '600': '#A3E635',  # A mid-tone lime
    '900': '#4D7C0F',  # A dark lime/green
}


def _calculate_cdf(series: pd.Series):
    """
    Calculates the empirical Cumulative Distribution Function (CDF) for a series.
    Returns the sorted values and their corresponding cumulative percentages.
    """
    # Drop NA values and ensure the series is sorted
    s = series.dropna().sort_values().reset_index(drop=True)
    # Calculate cumulative percentage: (index + 1) / total_count
    cdf_y = ((s.index + 1) / len(s)) * 100
    return s.values, cdf_y


def generate(processed_df: pd.DataFrame):
    """
    Generates a Cumulative Distribution Function (CDF) plot for task time estimates.

    This corresponds to the second 'cell11' from the original notebook. It plots
    the CDF for the lower-bound, upper-bound, and mid-point of time estimates,
    showing the percentage of tasks that can be completed within a certain time.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
                                     'lb_estimate_in_minutes',
                                     'ub_estimate_in_minutes'.

    Returns:
        Path: The path to the generated temporary image file, or None on failure.
    """
    logging.info("Generating temporal coherence CDF plot...")

    # --- Data Validation and Preparation ---
    required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
    if not all(col in processed_df.columns for col in required_cols):
        logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
        return None

    df = processed_df.copy()

    # Log scale requires positive values.
    df = df[(df['lb_estimate_in_minutes'] > 0) & (df['ub_estimate_in_minutes'] > 0)]
    if df.empty:
        logging.warning("No data with positive estimates available to generate CDF plot.")
        return None

    # Calculate mid-point estimate
    df['midpoint_estimate'] = (df['lb_estimate_in_minutes'] + df['ub_estimate_in_minutes']) / 2

    # Prepare data for CDF plots
    x_lb, y_lb = _calculate_cdf(df['lb_estimate_in_minutes'])
    x_ub, y_ub = _calculate_cdf(df['ub_estimate_in_minutes'])
    x_mid, y_mid = _calculate_cdf(df['midpoint_estimate'])

    # --- Plotting ---
    try:
        fig, ax = plt.subplots(figsize=(12, 8))

        # --- Grid and Reference Lines ---
        # Horizontal reference lines for percentages
        for y_val in range(0, 101, 10):
            ax.axhline(y_val, color=GRAY_PALETTE['100'], linewidth=0.8, zorder=1)

        # Vertical reference lines for human-friendly durations
        ticks = [1, 5, 10, 30, 60, 120, 240, 480, 1440, 2880, 10080, 43200]
        for tick in ticks:
            ax.axvline(tick, color=GRAY_PALETTE['300'], linewidth=0.8, linestyle='--', zorder=1)

        # --- CDF Plots ---
        ax.step(x_lb, y_lb, where='post', color=LIME_PALETTE['300'], linewidth=1.8, linestyle='--', zorder=2, label='Lower-bound Estimate (CDF)')
        ax.step(x_ub, y_ub, where='post', color=LIME_PALETTE['900'], linewidth=1.8, linestyle=':', zorder=3, label='Upper-bound Estimate (CDF)')
        ax.step(x_mid, y_mid, where='post', color=LIME_PALETTE['600'], linewidth=2.2, zorder=4, label='Mid-point Estimate (CDF)')

        # --- Axes Configuration ---
        ax.set_ylim(0, 100)
        ax.set_xscale('log')

        # Custom x-ticks for durations
        ticklabels = ['1 min', '5 min', '10 min', '30 min', '1 hr', '2 hrs', '4 hrs', '8 hrs', '1 day', '2 days', '1 week', '30 days']
        ax.set_xticks(ticks)
        ax.set_xticklabels(ticklabels, rotation=45, ha='right')
        ax.minorticks_off()  # Turn off minor ticks for clarity with custom grid

        # Format y-axis as percentages
        ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(decimals=0))

        # --- Spines and Labels ---
        for spine in ['top', 'right']:
            ax.spines[spine].set_visible(False)
        for spine in ['left', 'bottom']:
            ax.spines[spine].set_edgecolor(GRAY_PALETTE['300'])

        # Use ax.text for more control over label placement than ax.set_ylabel/xlabel
        ax.text(-0.07, 1.02, "% of tasks with duration ≤ X", transform=ax.transAxes,
                fontsize=12, fontweight='semibold', va='bottom')
        ax.text(0.5, -0.25, 'Task Duration (X)', transform=ax.transAxes,
                fontsize=12, fontweight='semibold', ha='center')

        ax.legend(frameon=False, loc='lower right')
        fig.suptitle('Cumulative Distribution of Task Time Estimates', fontsize=16, y=0.96)
        plt.tight_layout(rect=[0, 0, 1, 0.95])  # Adjust layout to make space for suptitle

        # --- File Saving ---
        temp_dir = tempfile.gettempdir()
        temp_path = Path(temp_dir) / "temporal_coherence_cdf.png"
        plt.savefig(temp_path, dpi=300, bbox_inches='tight')
        logging.info(f"Successfully saved plot to temporary file: {temp_path}")

        return temp_path

    except Exception as e:
        logging.error(f"An error occurred while generating the CDF plot: {e}", exc_info=True)
        return None
    finally:
        plt.close()
112  analysis/generators/time_estimate_spread_by_occupation.py  Normal file
@@ -0,0 +1,112 @@
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import tempfile
import logging
import pandas as pd

# Based on O*NET SOC 2018 structure, this mapping helps translate
# the 2-digit major group codes into human-readable labels.
OCCUPATION_MAJOR_CODES = {
    '11': 'Management',
    '13': 'Business & Financial',
    '15': 'Computer & Mathematical',
    '17': 'Architecture & Engineering',
    '19': 'Life, Physical, & Social Science',
    '21': 'Community & Social Service',
    '23': 'Legal',
    '25': 'Education, Training, & Library',
    '27': 'Arts, Design, & Media',
    '29': 'Healthcare Practitioners',
    '31': 'Healthcare Support',
    '33': 'Protective Service',
    '35': 'Food Preparation & Serving',
    '37': 'Building & Grounds Maintenance',
    '39': 'Personal Care & Service',
    '41': 'Sales & Related',
    '43': 'Office & Admin Support',
    '45': 'Farming, Fishing, & Forestry',
    '47': 'Construction & Extraction',
    '49': 'Installation, Maintenance, & Repair',
    '51': 'Production',
    '53': 'Transportation & Material Moving',
    '55': 'Military Specific',
}


def generate(processed_df: pd.DataFrame):
    """
    Generates a box plot showing the spread of time-range estimates per occupation.

    This corresponds to 'cell2' from the original analysis notebook. It visualizes
    the distribution of the difference between upper and lower time estimates for
    each major occupational group.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
                                     'lb_estimate_in_minutes',
                                     'ub_estimate_in_minutes', 'onetsoc_major'.

    Returns:
        Path: The path to the generated temporary image file, or None on failure.
    """
    logging.info("Generating plot of time estimate spread by occupation...")

    # --- Data Validation and Preparation ---
    required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'onetsoc_major']
    if not all(col in processed_df.columns for col in required_cols):
        logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
        return None

    df = processed_df.copy()

    # Calculate the estimate range.
    df['estimate_range'] = df['ub_estimate_in_minutes'] - df['lb_estimate_in_minutes']

    # For log scaling, we need positive values. Filter out any non-positive ranges.
    df = df[df['estimate_range'] > 0]
    if df.empty:
        logging.warning("No data with a positive estimate range available to plot.")
        return None

    # Sort by the major code to ensure a consistent plot order
    df = df.sort_values('onetsoc_major')

    # --- Plotting ---
    try:
        plt.figure(figsize=(14, 10))

        ax = sns.boxplot(
            data=df,
            x='onetsoc_major',
            y='estimate_range',
            showfliers=False  # Outliers are excluded for a clearer view of the main distribution
        )

        plt.yscale('log')  # The long tail of the data makes a log scale more readable
        plt.xlabel('Occupation Major Group', fontsize=12)
        plt.ylabel('Time Estimate Range (upper - lower, in minutes, log scale)', fontsize=12)
        plt.title('Spread of Time-Range Estimates by Occupation', fontsize=16)

        # Replace numeric x-tick labels (e.g., '11', '15') with meaningful text labels
        ax.set_xticklabels(
            [OCCUPATION_MAJOR_CODES.get(code.get_text(), code.get_text()) for code in ax.get_xticklabels()],
            rotation=60,
            ha='right'  # Align rotated labels correctly
        )

        plt.tight_layout()

        # --- File Saving ---
        temp_dir = tempfile.gettempdir()
        temp_path = Path(temp_dir) / "time_estimate_spread_by_occupation.png"
        plt.savefig(temp_path, dpi=300, bbox_inches='tight')
        logging.info(f"Successfully saved plot to temporary file: {temp_path}")

        return temp_path

    except Exception as e:
        logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
        return None
    finally:
        plt.close()
150  analysis/generators/wage_bill_by_occupation.py  Normal file
@@ -0,0 +1,150 @@
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd
from pathlib import Path
import tempfile
import logging

# Assuming data.py is in the same package and provides this function
from ..data import get_db_connection

# This mapping helps translate the O*NET 2-digit major group codes
# into human-readable labels for the plot's y-axis.
OCCUPATION_MAJOR_CODES = {
    '11': 'Management',
    '13': 'Business & Financial',
    '15': 'Computer & Mathematical',
    '17': 'Architecture & Engineering',
    '19': 'Life, Physical, & Social Science',
    '21': 'Community & Social Service',
    '23': 'Legal',
    '25': 'Education, Training, & Library',
    '27': 'Arts, Design, & Media',
    '29': 'Healthcare Practitioners',
    '31': 'Healthcare Support',
    '33': 'Protective Service',
    '35': 'Food Preparation & Serving',
    '37': 'Building & Grounds Maintenance',
    '39': 'Personal Care & Service',
    '41': 'Sales & Related',
    '43': 'Office & Admin Support',
    '45': 'Farming, Fishing, & Forestry',
    '47': 'Construction & Extraction',
    '49': 'Installation, Maintenance, & Repair',
    '51': 'Production',
    '53': 'Transportation & Material Moving',
    '55': 'Military Specific',
}


def generate(processed_df: pd.DataFrame):
    """
    Generates a bar plot of the total wage bill per major occupation group.

    This corresponds to the first 'cell11' from the original analysis notebook.
    It calculates the total wage bill (Total Employment * Annual Mean Wage) for
    each occupation and aggregates it by major occupation group. This generator
    loads its data directly from the O*NET database.

    Args:
        processed_df (pd.DataFrame): The preprocessed data (not used in this generator,
                                     but required by the function signature).

    Returns:
        Path: The path to the generated temporary image file, or None on failure.
    """
    logging.info("Generating plot of total wage bill by occupation...")
    conn = None
    try:
        # --- Data Loading ---
        # This generator needs specific data that is not in the main preprocessed_df.
        # It loads occupational employment and wage data directly from the database.
        conn = get_db_connection()
        if conn is None:
            raise ConnectionError("Could not get database connection.")

        # This data is stored in a long format in the `occupation_level_metadata` table.
        # We need to query this table and pivot it to get employment and wage columns.
        query = "SELECT onetsoc_code, item, response FROM occupation_level_metadata WHERE item IN ('Employment', 'Annual Mean Wage')"
        try:
            df_meta = pd.read_sql_query(query, conn)

            # Pivot the table to create 'Employment' and 'Annual Mean Wage' columns
            df_oesm = df_meta.pivot(index='onetsoc_code', columns='item', values='response').reset_index()
            logging.info("Pivoted occupation metadata. Columns are: %s", df_oesm.columns.tolist())

            # Rename for consistency with the original notebook's code
            df_oesm.rename(columns={
                'onetsoc_code': 'OCC_CODE',
                'Employment': 'TOT_EMP',
                'Annual Mean Wage': 'A_MEAN'
            }, inplace=True)
        except (pd.io.sql.DatabaseError, KeyError) as e:
            logging.error(f"Failed to query or pivot occupation metadata: {e}", exc_info=True)
            return None

        # --- Data Preparation ---
        # Create a 'major group' code from the first two digits of the SOC code
        df_oesm['onetsoc_major'] = df_oesm['OCC_CODE'].str[:2]

        # Ensure wage and employment columns are numeric, coercing errors to NaN
        df_oesm['TOT_EMP'] = pd.to_numeric(df_oesm['TOT_EMP'], errors='coerce')
        df_oesm['A_MEAN'] = pd.to_numeric(df_oesm['A_MEAN'], errors='coerce')

        # Drop rows with missing data in critical columns
        df_oesm.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_major'], inplace=True)

        # Calculate the wage bill for each occupation
        df_oesm['wage_bill'] = df_oesm['TOT_EMP'] * df_oesm['A_MEAN']

        # Aggregate the wage bill by major occupation group
        df_wage_bill_major = df_oesm.groupby('onetsoc_major')['wage_bill'].sum().reset_index()

        # Map the major codes to readable titles for plotting
        df_wage_bill_major['OCC_TITLE_MAJOR'] = df_wage_bill_major['onetsoc_major'].map(OCCUPATION_MAJOR_CODES)
        df_wage_bill_major.dropna(subset=['OCC_TITLE_MAJOR'], inplace=True)  # Drop military/unmapped codes

        # Sort by wage bill for a more informative plot
        df_wage_bill_major = df_wage_bill_major.sort_values('wage_bill', ascending=False)

        if df_wage_bill_major.empty:
            logging.warning("No data available to generate the wage bill plot.")
            return None

        # --- Plotting ---
        plt.figure(figsize=(12, 10))
        ax = sns.barplot(x='wage_bill', y='OCC_TITLE_MAJOR', data=df_wage_bill_major, palette="viridis", orient='h')
        ax.set_title('Total Wage Bill per Major Occupation Group', fontsize=16, pad=15)
        ax.set_xlabel('Total Wage Bill (in USD)', fontsize=12)
        ax.set_ylabel('Major Occupation Group', fontsize=12)
        ax.grid(axis='x', linestyle='--', alpha=0.7)

        # Format the x-axis to be more readable (e.g., "$2.0T" for trillions)
        def format_billions(x, pos):
            if x >= 1e12:
                return f'${x*1e-12:.1f}T'
            if x >= 1e9:
                return f'${x*1e-9:.0f}B'
            return f'${x*1e-6:.0f}M'
        ax.xaxis.set_major_formatter(mticker.FuncFormatter(format_billions))

        plt.tight_layout()

        # --- File Saving ---
        temp_dir = tempfile.gettempdir()
        temp_path = Path(temp_dir) / "wage_bill_by_occupation.png"
        plt.savefig(temp_path, dpi=300)
        logging.info(f"Successfully saved plot to temporary file: {temp_path}")

        return temp_path

    except Exception as e:
        logging.error(f"An error occurred while generating the wage bill plot: {e}", exc_info=True)
        return None
    finally:
        plt.close()
        if conn:
            conn.close()
64  analysis/main.py  Normal file
@@ -0,0 +1,64 @@
import logging
import sys

# Since this file is inside the 'analysis' package, we use relative imports
# to access the other modules within the same package.
from . import data
from . import preprocess
from . import generate

# Configure logging for the entire application.
# This setup will apply to loggers in data, preprocess, and generate modules as well.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    stream=sys.stdout
)


def main():
    """
    The main entry point for the entire analysis pipeline.

    This function orchestrates the three main stages of the analysis:
    1. Data Setup: Downloads and prepares the necessary raw data and database.
    2. Preprocessing: Cleans, enriches, and transforms the raw data into an
       analysis-ready DataFrame.
    3. Output Generation: Runs all registered generators to produce figures,
       tables, and other outputs, saving them to the 'dist/' directory.
    """
    logger = logging.getLogger(__name__)
    logger.info("=================================================")
    logger.info("       STARTING ECONTAI ANALYSIS PIPELINE        ")
    logger.info("=================================================")

    try:
        # Stage 1: Set up the data and database
        logger.info("--- STAGE 1: DATA SETUP ---")
        data.setup_data_and_database()
        logger.info("--- DATA SETUP COMPLETE ---")

        # Stage 2: Run the preprocessing pipeline
        logger.info("--- STAGE 2: PREPROCESSING ---")
        processed_dataframe = preprocess.run_preprocessing()
        logger.info("--- PREPROCESSING COMPLETE ---")

        # Stage 3: Generate all outputs
        logger.info("--- STAGE 3: OUTPUT GENERATION ---")
        generate.create_all_outputs(processed_dataframe)
        logger.info("--- OUTPUT GENERATION COMPLETE ---")

        logger.info("=================================================")
        logger.info("     ANALYSIS PIPELINE COMPLETED SUCCESSFULLY    ")
        logger.info("=================================================")

    except Exception as e:
        logger.critical("An unrecoverable error occurred during the pipeline execution.", exc_info=True)
        # Exit with a non-zero status code to indicate failure, which is useful for automation.
        sys.exit(1)


# This allows the script to be run from the command line using `python -m analysis.main`.
# The `-m` flag is important because it adds the parent directory to the Python path,
# allowing the relative imports (e.g., `from . import data`) to work correctly.
if __name__ == '__main__':
    main()
160  analysis/preprocess.py  Normal file
@@ -0,0 +1,160 @@
import logging
import pandas as pd
import numpy as np
from scipy.stats import median_abs_deviation
from .data import get_db_connection

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def _convert_to_minutes(level: float) -> float:
    """
    Converts O*NET 'Frequency' scale values (levels) to estimated minutes per day.
    This logic is derived from the `preprocessing_time_estimates` function
    in the original analysis notebook.
    """
    if pd.isna(level):
        return 0
    # This mapping is an interpretation of the O*NET frequency scale.
    return {
        1: 0,    # Yearly or less
        2: 2,    # Several times a year
        3: 10,   # Several times a month
        4: 30,   # Several times a week
        5: 120,  # Daily
        6: 240,  # Several times a day
        7: 480,  # Hourly or more
    }.get(int(level), 0)


def _mad_z_score(series: pd.Series) -> pd.Series:
    """
    Calculates the robust Z-score using Median Absolute Deviation (MAD).
    This function is derived from 'cell7' of the original analysis.
    """
    if series.isnull().all():
        return pd.Series([np.nan] * len(series), index=series.index)

    median = series.median()
    # scale='normal' makes MAD comparable to the standard deviation for a normal distribution.
    mad = median_abs_deviation(series.dropna(), scale='normal')
    if mad == 0:
        return pd.Series([np.nan] * len(series), index=series.index)
    return (series - median) / mad


def run_preprocessing() -> pd.DataFrame:
    """
    Main orchestrator for the preprocessing pipeline.

    This function faithfully reproduces the data transformation pipeline from the
    original `analysis.py` script, including the `preprocessing_time_estimates`
    and cell-specific data manipulations.

    Returns:
        pd.DataFrame: A fully preprocessed DataFrame ready for the generators.
    """
    logging.info("Starting data preprocessing...")
    conn = None
    try:
        conn = get_db_connection()
        if conn is None:
            raise ConnectionError("Could not establish database connection.")

        # --- 1. Load Data from Database ---
        # Fetch all necessary tables to build the initial DataFrame.
        logging.info("Loading data from O*NET database...")
        task_ratings_df = pd.read_sql_query("SELECT * FROM task_ratings", conn)
        task_statements_df = pd.read_sql_query("SELECT * FROM task_statements", conn)
        occupations_df = pd.read_sql_query("SELECT * FROM occupation_data", conn)

        # --- 2. Initial Merge ---
        # Merge the tables to create a comprehensive base DataFrame.
        # Merging on both 'onetsoc_code' and 'task_id' is crucial to avoid
        # creating duplicate columns from the overlapping 'onetsoc_code'.
        logging.info("Merging base tables...")
        tasks_df = pd.merge(task_ratings_df, task_statements_df, on=['onetsoc_code', 'task_id'])
        tasks_df = pd.merge(tasks_df, occupations_df, on='onetsoc_code')

        # --- 3. Create "Atomic Tasks" and Time Estimates (from `preprocessing_time_estimates`) ---
        # This is the core of the analysis, focusing on tasks with frequency ratings.
        logging.info("Filtering for 'atomic tasks' (scale_id='FR') and calculating time estimates...")
        # Strip whitespace from scale_id to ensure the filter works correctly.
        tasks_df['scale_id'] = tasks_df['scale_id'].str.strip()
        atomic_tasks = tasks_df[tasks_df['scale_id'] == 'FR'].copy()

        # Convert frequency confidence intervals into minutes/day
        atomic_tasks['lb_estimate_in_minutes'] = atomic_tasks['lower_ci_bound'].apply(_convert_to_minutes)
        atomic_tasks['ub_estimate_in_minutes'] = atomic_tasks['upper_ci_bound'].apply(_convert_to_minutes)
        atomic_tasks['estimate_midpoint'] = (atomic_tasks['lb_estimate_in_minutes'] + atomic_tasks['ub_estimate_in_minutes']) / 2

        # --- 4. Add Derived Columns for Analysis (from `cell` logic) ---
        logging.info("Adding derived columns for analysis...")

        # Add `onetsoc_major` for grouping by occupation category
        atomic_tasks['onetsoc_major'] = atomic_tasks['onetsoc_code'].str[:2]

        # Calculate estimate_range and estimate_ratio used in several plots
        atomic_tasks['estimate_range'] = atomic_tasks['ub_estimate_in_minutes'] - atomic_tasks['lb_estimate_in_minutes']

        # To calculate ratio, ensure lower bound is positive to avoid division by zero
        lb_positive = atomic_tasks['lb_estimate_in_minutes'] > 0
        atomic_tasks['estimate_ratio'] = np.nan
        atomic_tasks.loc[lb_positive, 'estimate_ratio'] = atomic_tasks['ub_estimate_in_minutes'] / atomic_tasks['lb_estimate_in_minutes']

        # --- 5. Calculate Outlier Scores (from `cell6` and `cell7`) ---
        logging.info("Calculating standard and robust Z-scores for outlier detection...")

        # Standard Z-score
        grouped_stats = atomic_tasks.groupby('onetsoc_code')['estimate_midpoint'].agg(['mean', 'std'])
        atomic_tasks = atomic_tasks.merge(grouped_stats, on='onetsoc_code', how='left')

        # Calculate Z-score, avoiding division by zero if std is 0
        non_zero_std = atomic_tasks['std'].notna() & (atomic_tasks['std'] != 0)
        atomic_tasks['z_score'] = np.nan
        atomic_tasks.loc[non_zero_std, 'z_score'] = \
            (atomic_tasks.loc[non_zero_std, 'estimate_midpoint'] - atomic_tasks.loc[non_zero_std, 'mean']) / atomic_tasks.loc[non_zero_std, 'std']

        # Robust Z-score (using MAD)
        atomic_tasks['robust_z_score'] = atomic_tasks.groupby('onetsoc_code')['estimate_midpoint'].transform(_mad_z_score)

        # --- 6. Prepare for other generators ---
        # NOTE: The data for the 'task_breakdown_by_occupation' generator, specifically
        # the 'remote_status' and 'estimateable' columns, is not available in the O*NET
        # database. This data was likely loaded from a separate file (e.g., 'tasks_clean.parquet')
        # in the original notebook. For now, we will add placeholder columns.
        atomic_tasks['remote_status'] = 'unknown'
        atomic_tasks['estimateable'] = 'unknown'

        logging.info("Data preprocessing complete.")
        return atomic_tasks

    except Exception as e:
        logging.error("An error occurred during preprocessing: %s", e, exc_info=True)
        # Return an empty DataFrame on failure to prevent downstream errors
        return pd.DataFrame()
    finally:
        if conn:
            conn.close()
            logging.info("Database connection closed.")


if __name__ == '__main__':
    # This allows the preprocessing to be run directly for testing or debugging.
    # Note: Requires data to be set up first by running data.py.
    try:
        processed_data = run_preprocessing()
        if not processed_data.empty:
            print("Preprocessing successful. DataFrame shape:", processed_data.shape)
            print("Columns:", processed_data.columns.tolist())
            print(processed_data.head())
            # Save to a temporary file to inspect the output
            output_path = "temp_preprocessed_data.csv"
            processed_data.to_csv(output_path, index=False)
            print(f"Sample output saved to {output_path}")
        else:
            print("Preprocessing failed or resulted in an empty DataFrame.")

    except (FileNotFoundError, ConnectionError) as e:
        logging.error("Failed to run preprocessing: %s", e)
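To illustrate (not part of the commit) why run_preprocessing computes a MAD-based robust Z-score alongside the standard one: a single extreme estimate inflates the group's mean and standard deviation and so partly masks itself, while the median/MAD version still flags it clearly. Toy numbers only:

import pandas as pd
from scipy.stats import median_abs_deviation

s = pd.Series([10, 12, 11, 13, 10, 600])  # five similar estimates, one outlier
standard_z = (s - s.mean()) / s.std()
robust_z = (s - s.median()) / median_abs_deviation(s, scale='normal')
print(standard_z.round(2).tolist())  # the outlier scores only ~2 standard deviations
print(robust_z.round(2).tolist())    # the outlier scores in the hundreds of MADs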