Félix Dorn 2025-07-15 00:41:05 +02:00
parent 720f21a85b
commit 43076bcbb1
42 changed files with 237415 additions and 7831 deletions

0
analysis/__init__.py Normal file

207
analysis/data.py Normal file

@@ -0,0 +1,207 @@
import logging
import re
import requests
import shutil
import sqlite3
import zipfile
from pathlib import Path
# Configure logging to provide feedback during the data setup process
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# --- Constants ---
# Using a data directory at the root of the project
DATA_DIR = Path("data")
# O*NET database details. We download the MySQL version and convert it to SQLite.
ONET_MYSQL_URL = "https://www.onetcenter.org/dl_files/database/db_29_3_mysql.zip"
DB_ZIP_PATH = DATA_DIR / "onet_mysql.zip"
DB_FILE_PATH = DATA_DIR / "onet.db"
EXTRACT_DIR = DATA_DIR / "onet_mysql_extracted"
# Other required data files are distributed in a separate O*NET text data archive.
ONET_TEXT_URL = "https://www.onetcenter.org/dl_files/database/db_29_3_text.zip"
TEXT_ZIP_PATH = DATA_DIR / "onet_text.zip"
TASK_RATINGS_PATH = DATA_DIR / "Task Ratings.txt"
DWA_REFERENCE_PATH = DATA_DIR / "DWA Reference.txt"
def setup_data_and_database():
"""
Main function to orchestrate the data setup.
It ensures the data directory exists, then downloads and sets up the O*NET database
and any other required data files.
"""
logging.info("Starting data and database setup...")
DATA_DIR.mkdir(exist_ok=True)
_setup_onet_database()
_download_additional_data()
logging.info("Data and database setup complete.")
def _setup_onet_database():
"""
Downloads the O*NET MySQL database, extracts it, and imports it into a
new SQLite database, mirroring the bulk-import performance settings of the
earlier shell-script-based setup (relaxed PRAGMAs, single transaction).
This method performs a minimal text-based conversion of the MySQL dump to
make it compatible with SQLite before importing.
"""
if DB_FILE_PATH.exists():
logging.info("O*NET database already exists at %s. Skipping setup.", DB_FILE_PATH)
return
logging.info("O*NET database not found. Starting fresh setup.")
# Ensure the extraction directory is clean before use
if EXTRACT_DIR.exists():
shutil.rmtree(EXTRACT_DIR)
EXTRACT_DIR.mkdir()
try:
# 1. Download if necessary
if not DB_ZIP_PATH.exists():
logging.info("Downloading O*NET database from %s", ONET_MYSQL_URL)
_download_file(ONET_MYSQL_URL, DB_ZIP_PATH)
else:
logging.info("Using existing O*NET zip file at %s", DB_ZIP_PATH)
# 2. Extract
logging.info("Extracting O*NET database files to %s", EXTRACT_DIR)
with zipfile.ZipFile(DB_ZIP_PATH, 'r') as zip_ref:
zip_ref.extractall(EXTRACT_DIR)
# 3. Create new DB with performance PRAGMAs
logging.info("Creating new SQLite database with performance settings: %s", DB_FILE_PATH)
conn = sqlite3.connect(DB_FILE_PATH)
conn.executescript("""
PRAGMA journal_mode = OFF;
PRAGMA synchronous = 0;
PRAGMA cache_size = 1000000;
PRAGMA locking_mode = EXCLUSIVE;
PRAGMA temp_store = MEMORY;
""")
conn.close()
# 4. Combine all SQL files, convert, and import in a single transaction
logging.info("Combining and converting SQL files for single transaction import...")
sql_files = sorted(EXTRACT_DIR.rglob('*.sql'))
if not sql_files:
raise FileNotFoundError(f"No SQL files found in {EXTRACT_DIR}")
# Concatenate all files into one string
mysql_dump = "\n".join([sql_file.read_text(encoding='utf-8') for sql_file in sql_files])
# Minimal conversion for SQLite: remove backticks and ENGINE clauses
sqlite_dump = mysql_dump.replace('`', '')
sqlite_dump = re.sub(r'\) ENGINE=InnoDB.*?;', ');', sqlite_dump, flags=re.DOTALL)
full_script = f"BEGIN TRANSACTION;\n{sqlite_dump}\nCOMMIT;"
logging.info(f"Importing {len(sql_files)} SQL files into database...")
conn = sqlite3.connect(DB_FILE_PATH)
conn.executescript(full_script)
conn.close()
logging.info("Database populated successfully.")
# 5. Restore reliability settings and optimize
logging.info("Restoring reliability settings and optimizing database...")
conn = sqlite3.connect(DB_FILE_PATH)
conn.executescript("""
PRAGMA journal_mode = WAL;
PRAGMA synchronous = NORMAL;
PRAGMA locking_mode = NORMAL;
PRAGMA temp_store = DEFAULT;
PRAGMA foreign_keys = ON;
PRAGMA optimize;
""")
conn.execute("VACUUM;")
conn.close()
logging.info("Database setup and optimization complete.")
except Exception as e:
logging.error("Failed during database setup: %s", e, exc_info=True)
if DB_FILE_PATH.exists():
DB_FILE_PATH.unlink()
raise
finally:
# 6. Cleanup
logging.info("Cleaning up temporary files...")
if DB_ZIP_PATH.exists():
DB_ZIP_PATH.unlink()
if EXTRACT_DIR.exists():
shutil.rmtree(EXTRACT_DIR)
def _download_additional_data():
"""
Downloads and extracts supplementary data files from the O*NET text archive.
If the required text files already exist, this function does nothing.
"""
required_files = [TASK_RATINGS_PATH, DWA_REFERENCE_PATH]
if all(p.exists() for p in required_files):
logging.info("All required text data files already exist. Skipping download.")
return
logging.info("One or more text data files are missing. Downloading and extracting from archive...")
try:
_download_file(ONET_TEXT_URL, TEXT_ZIP_PATH)
logging.info("Unzipping text data archive...")
with zipfile.ZipFile(TEXT_ZIP_PATH, 'r') as zip_ref:
# Extract only the files we need, without creating subdirectories
for target_path in required_files:
if not target_path.exists():
# Find the corresponding file within the zip archive's directory structure
member_name = next((m for m in zip_ref.namelist() if m.endswith(target_path.name)), None)
if member_name:
with zip_ref.open(member_name) as source, open(target_path, 'wb') as target:
target.write(source.read())
logging.info("Extracted %s", target_path.name)
else:
logging.warning("Could not find %s in the text data archive.", target_path.name)
except requests.exceptions.RequestException as e:
logging.error("Failed to download O*NET text data archive: %s", e)
raise
except zipfile.BadZipFile as e:
logging.error("Failed to process the text data archive: %s", e)
raise
finally:
# Clean up the downloaded zip file
if TEXT_ZIP_PATH.exists():
TEXT_ZIP_PATH.unlink()
logging.info("Cleaned up downloaded text archive zip file.")
def _download_file(url, destination):
"""
Helper function to download a file from a URL, with streaming for large files.
"""
logging.info("Downloading from %s to %s", url, destination)
with requests.get(url, stream=True) as r:
r.raise_for_status()
with open(destination, 'wb') as f:
for chunk in r.iter_content(chunk_size=8192):
f.write(chunk)
logging.info("Download of %s complete.", destination.name)
def get_db_connection():
"""
Establishes and returns a connection to the SQLite database.
Returns None if the database file does not exist.
"""
if not DB_FILE_PATH.exists():
logging.error("Database file not found at %s. Run the setup process first.", DB_FILE_PATH)
return None
try:
conn = sqlite3.connect(DB_FILE_PATH)
return conn
except sqlite3.Error as e:
logging.error("Failed to connect to the database: %s", e)
return None
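# Illustrative usage note (not part of the original module): downstream code
# such as preprocess.py opens a connection and reads whole tables into pandas:
#   conn = get_db_connection()
#   df = pd.read_sql_query("SELECT * FROM task_ratings", conn)
#   conn.close()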
if __name__ == '__main__':
# This allows the data setup to be run directly from the command line,
# which is useful for initialization or debugging.
setup_data_and_database()
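
As a reference for the conversion step above, here is a minimal, self-contained sketch (the table and column names are made up, not taken from the O*NET dump) of what the backtick removal and ENGINE-clause stripping do to a MySQL CREATE TABLE statement:

import re

mysql_ddl = """CREATE TABLE `example_table` (
  `task_id` decimal(8,0) NOT NULL,
  `scale_id` varchar(3) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;"""

# Same two substitutions as in _setup_onet_database above.
sqlite_ddl = mysql_ddl.replace('`', '')
sqlite_ddl = re.sub(r'\) ENGINE=InnoDB.*?;', ');', sqlite_ddl, flags=re.DOTALL)
print(sqlite_ddl)
# CREATE TABLE example_table (
#   task_id decimal(8,0) NOT NULL,
#   scale_id varchar(3) NOT NULL
# );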

76
analysis/generate.py Normal file

@@ -0,0 +1,76 @@
import importlib
import logging
import pkgutil
import shutil
from pathlib import Path
# The final destination for all generated outputs
DIST_DIR = Path("dist")
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def create_all_outputs(processed_df):
"""
Dynamically discovers, imports, and runs all output generators.
This function iterates through all modules in the 'analysis.generators'
package. For each module, it assumes there is a 'generate(data)' function,
which it calls with the provided preprocessed DataFrame.
The generator function is expected to save its output to a temporary file
and return the path to that file. This function then moves the output
to the 'dist/' directory.
Args:
processed_df (pd.DataFrame): The fully preprocessed data to be used
by the generator functions.
"""
logging.info("Starting output generation...")
DIST_DIR.mkdir(exist_ok=True)
logging.info(f"Output directory is '{DIST_DIR.resolve()}'")
# Path to the generators package
from . import generators as generators_package
generators_path = generators_package.__path__
generators_prefix = generators_package.__name__ + "."
generated_files_count = 0
# Discover and run all modules in the generators package
for _, module_name, _ in pkgutil.iter_modules(generators_path, prefix=generators_prefix):
try:
logging.info(f"--- Running generator: {module_name} ---")
# Import the generator module
generator_module = importlib.import_module(module_name)
# Check if the module has the required 'generate' function
if not hasattr(generator_module, 'generate'):
logging.warning(f"Generator module {module_name} does not have a 'generate' function. Skipping.")
continue
# Call the generator function, passing in the preprocessed data
generator_func = getattr(generator_module, 'generate')
temp_output_path = generator_func(processed_df)
# If the generator returned a path, move the file to the dist directory
if temp_output_path and isinstance(temp_output_path, Path) and temp_output_path.exists():
# Sanitize the module name to create a valid filename
base_filename = module_name.split('.')[-1]
# Keep the original extension from the temp file
final_filename = base_filename + temp_output_path.suffix
final_output_path = DIST_DIR / final_filename
shutil.move(temp_output_path, final_output_path)
logging.info(f"Successfully generated '{final_output_path.name}'")
generated_files_count += 1
else:
logging.warning(f"Generator {module_name} did not return a valid output file path. Nothing was saved.")
except Exception as e:
logging.error(f"Failed to run generator {module_name}. Error: {e}", exc_info=True)
# Continue to the next generator
logging.info(f"--- Output generation complete. Total files generated: {generated_files_count} ---")


@@ -0,0 +1,119 @@
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import tempfile
import logging
import pandas as pd
import numpy as np
# Copied from other generators for modularity. This dictionary maps
# O*NET major occupation group codes to human-readable labels.
OCCUPATION_MAJOR_CODES = {
'11': 'Management',
'13': 'Business & Financial',
'15': 'Computer & Mathematical',
'17': 'Architecture & Engineering',
'19': 'Life, Physical, & Social Science',
'21': 'Community & Social Service',
'23': 'Legal',
'25': 'Education, Training, & Library',
'27': 'Arts, Design, & Media',
'29': 'Healthcare Practitioners',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation & Serving',
'37': 'Building & Grounds Maintenance',
'39': 'Personal Care & Service',
'41': 'Sales & Related',
'43': 'Office & Admin Support',
'45': 'Farming, Fishing, & Forestry',
'47': 'Construction & Extraction',
'49': 'Installation, Maintenance, & Repair',
'51': 'Production',
'53': 'Transportation & Material Moving',
'55': 'Military Specific',
}
def generate(processed_df: pd.DataFrame):
"""
Generates a scatter plot comparing lower vs. upper time estimates for tasks.
This corresponds to 'cell3' from the original analysis notebook. It helps
visualize the relationship and spread between the lower and upper bounds
of time estimates across different occupation groups.
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'lb_estimate_in_minutes',
'ub_estimate_in_minutes', 'onetsoc_major'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating plot of lower vs. upper time estimates...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'onetsoc_major']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# For log scaling, both lower and upper bounds must be positive.
df = df[(df['lb_estimate_in_minutes'] > 0) & (df['ub_estimate_in_minutes'] > 0)]
if df.empty:
logging.warning("No data with positive lower and upper estimates available to plot.")
return None
# Replace the major code with its readable label for the hue legend.
df['occupation_label'] = df['onetsoc_major'].map(OCCUPATION_MAJOR_CODES)
# --- Plotting ---
try:
plt.figure(figsize=(12, 10))
ax = sns.scatterplot(
data=df,
x='lb_estimate_in_minutes',
y='ub_estimate_in_minutes',
alpha=0.2,
edgecolor=None,
hue="occupation_label" # Use the labeled column for the legend
)
# Determine limits for the 45° reference line
# Use the maximum of both columns to create a square plot
max_val = df[['lb_estimate_in_minutes', 'ub_estimate_in_minutes']].max().max()
lims = (df[['lb_estimate_in_minutes', 'ub_estimate_in_minutes']].min().min(), max_val)
ax.plot(lims, lims, color='black', linestyle='--', linewidth=1, label='Upper = Lower')
# Add helper lines for constant ratios (2x, 10x, 100x)
for k in [2, 10, 100]:
ax.plot(lims, [k * l for l in lims],
linestyle=':', color='grey', linewidth=0.8, label=f'Upper = {k}x Lower')
ax.set(xscale='log', yscale='log', xlim=lims, ylim=lims)
ax.set_xlabel('Lower-bound Estimate (minutes, log scale)', fontsize=12)
ax.set_ylabel('Upper-bound Estimate (minutes, log scale)', fontsize=12)
ax.set_title('Lower vs. Upper Time Estimates for All Tasks', fontsize=16)
# Place the legend outside the plot to avoid obscuring data
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', title='Occupation / Ratio')
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "estimate_lower_vs_upper_bounds.png"
# Use bbox_inches='tight' to ensure the external legend is included in the saved image.
plt.savefig(temp_path, dpi=300, bbox_inches='tight')
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
return None
finally:
plt.close()


@@ -0,0 +1,86 @@
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import tempfile
import logging
def generate(processed_df: pd.DataFrame):
"""
Generates a histogram of the log-ratio of upper to lower time estimates.
This corresponds to 'cell4' from the original analysis notebook. It shows
the distribution of how many times larger the upper estimate is compared
to the lower estimate.
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'lb_estimate_in_minutes',
'ub_estimate_in_minutes'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating distribution plot of estimate ratios...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# Calculate the ratio. We need to handle cases where the lower bound is zero.
# Replace lower bound of 0 with a small number to avoid division by zero, or filter them out.
# Here, we filter, as a ratio with a zero denominator is undefined.
df = df[df['lb_estimate_in_minutes'] > 0]
df['estimate_ratio'] = df['ub_estimate_in_minutes'] / df['lb_estimate_in_minutes']
# Replace infinite values (which can occur if ub is huge and lb is tiny) with NaN
# and drop rows with NaN or infinite ratios.
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(subset=['estimate_ratio'], inplace=True)
if df.empty:
logging.warning("No valid data available to plot the estimate ratio distribution.")
return None
# --- Plotting ---
try:
plt.figure(figsize=(10, 6))
# We plot the log10 of the ratio to better visualize the wide distribution
log_ratio = np.log10(df['estimate_ratio'])
sns.histplot(log_ratio, bins=60, kde=True)
# Add vertical lines for reference points
# log10(1) = 0, which is where upper bound equals lower bound
plt.axvline(x=0, color='black', linestyle='-', linewidth=1.5, label='1x (Upper = Lower)')
# A small ratio, e.g., 5% difference
plt.axvline(x=np.log10(1.05), color='orange', linestyle='--', linewidth=1, label='1.05x ratio')
# A 10x ratio
plt.axvline(x=np.log10(10), color='red', linestyle='--', linewidth=1, label='10x ratio')
plt.xlabel('log₁₀(Upper Estimate / Lower Estimate)', fontsize=12)
plt.ylabel('Number of Tasks', fontsize=12)
plt.title('Distribution of Time Estimate Ratios', fontsize=16)
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "estimate_ratio_distribution.png"
plt.savefig(temp_path, dpi=300)
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
return None
finally:
plt.close()


@@ -0,0 +1,135 @@
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
import tempfile
import logging
# This mapping helps translate the O*NET 2-digit major group codes
# into human-readable labels for the plot's y-axis.
OCCUPATION_MAJOR_CODES = {
'11': 'Management',
'13': 'Business & Financial',
'15': 'Computer & Mathematical',
'17': 'Architecture & Engineering',
'19': 'Life, Physical, & Social Science',
'21': 'Community & Social Service',
'23': 'Legal',
'25': 'Education, Training, & Library',
'27': 'Arts, Design, & Media',
'29': 'Healthcare Practitioners',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation & Serving',
'37': 'Building & Grounds Maintenance',
'39': 'Personal Care & Service',
'41': 'Sales & Related',
'43': 'Office & Admin Support',
'45': 'Farming, Fishing, & Forestry',
'47': 'Construction & Extraction',
'49': 'Installation, Maintenance, & Repair',
'51': 'Production',
'53': 'Transportation & Material Moving',
'55': 'Military Specific',
}
def generate(processed_df: pd.DataFrame):
"""
Generates a heatmap of the median estimate ratio by occupation and task length quartile.
This corresponds to 'cell5' from the original analysis notebook. It shows
how the ratio between upper and lower time estimates varies across
different occupations and for tasks of different typical lengths (binned
into quartiles).
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'lb_estimate_in_minutes',
'ub_estimate_in_minutes', 'onetsoc_major'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating heatmap of estimate ratios by occupation and task length...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'onetsoc_major']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# Calculate the estimate ratio, handling division by zero and infinity
df = df[df['lb_estimate_in_minutes'] > 0]
df['estimate_ratio'] = df['ub_estimate_in_minutes'] / df['lb_estimate_in_minutes']
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(subset=['estimate_ratio'], inplace=True)
if df.empty:
logging.warning("No valid data available for the ratio heatmap.")
return None
# 1. Bin lower bounds into quartiles (Q1-Q4).
# duplicates='drop' avoids errors from repeated quantile edges; if edges are
# actually dropped, the four labels no longer match and the ValueError below is raised.
try:
df['lb_q'] = pd.qcut(
df.lb_estimate_in_minutes,
q=4,
labels=['Q1 (Shortest)', 'Q2', 'Q3', 'Q4 (Longest)'],
duplicates='drop'
)
except ValueError as e:
logging.error(f"Could not bin data into quartiles: {e}. There might not be enough unique values.")
return None
# 2. Aggregate: median ratio per cell (occupation x task length quartile)
pivot = df.pivot_table(
index='onetsoc_major',
columns='lb_q',
values='estimate_ratio',
aggfunc='median'
)
# Map the index (onetsoc_major codes) to their corresponding readable labels
pivot.index = pivot.index.map(OCCUPATION_MAJOR_CODES)
pivot.dropna(inplace=True) # Drop occupations with no data in some quartiles for a cleaner plot
if pivot.empty:
logging.warning("Pivot table is empty after processing. Cannot generate heatmap.")
return None
# --- Plotting ---
try:
plt.figure(figsize=(12, 10))
sns.heatmap(
pivot,
cmap='RdYlGn_r', # Red-Yellow-Green (reversed): narrow ratios show green, wide ratios red
center=2, # Center the colormap around a ratio of 2
annot=True, # Show the median values in the cells
fmt='.1f', # Format annotations to one decimal place
linewidths=.5,
cbar_kws={'label': 'Median Upper/Lower Estimate Ratio'}
)
plt.xlabel('Task Length (based on lower-bound quartile)', fontsize=12)
plt.ylabel('Occupation Major Group', fontsize=12)
plt.title('Typical Estimate Range Width by Occupation and Task Length', fontsize=16)
plt.tight_layout()
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "ratio_heatmap_by_occupation_and_task_length.png"
plt.savefig(temp_path, dpi=300)
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the heatmap: {e}", exc_info=True)
return None
finally:
plt.close()


@@ -0,0 +1,161 @@
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.colors as mcolors
from pathlib import Path
import tempfile
import logging
# This mapping helps translate the O*NET 2-digit major group codes
# into human-readable labels for the plot's y-axis.
OCCUPATION_MAJOR_CODES = {
'11': 'Management',
'13': 'Business & Financial',
'15': 'Computer & Mathematical',
'17': 'Architecture & Engineering',
'19': 'Life, Physical, & Social Science',
'21': 'Community & Social Service',
'23': 'Legal',
'25': 'Education, Training, & Library',
'27': 'Arts, Design, & Media',
'29': 'Healthcare Practitioners',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation & Serving',
'37': 'Building & Grounds Maintenance',
'39': 'Personal Care & Service',
'41': 'Sales & Related',
'43': 'Office & Admin Support',
'45': 'Farming, Fishing, & Forestry',
'47': 'Construction & Extraction',
'49': 'Installation, Maintenance, & Repair',
'51': 'Production',
'53': 'Transportation & Material Moving',
'55': 'Military Specific',
}
# Define colors to match the original notebook's palette.
# These are standard hex codes for gray and lime shades.
BAR_COLORS = [
'#D1D5DB', # gray-300
'#84CC16', # lime-500
'#D9F99D', # lime-200
]
def _get_contrasting_text_color(bg_color_hex):
"""
Determines if black or white text provides better contrast against a given background color.
"""
try:
rgba = mcolors.to_rgba(bg_color_hex)
# Calculate luminance (Y) using the sRGB formula
luminance = 0.2126 * rgba[0] + 0.7152 * rgba[1] + 0.0722 * rgba[2]
return 'black' if luminance > 0.55 else 'white'
except ValueError:
return 'black' # Default to black if color is invalid
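# Worked example (illustrative, not in the original file): for the gray
# background '#D1D5DB', mcolors.to_rgba returns roughly (0.82, 0.84, 0.86, 1.0),
# so luminance ≈ 0.2126*0.82 + 0.7152*0.84 + 0.0722*0.86 ≈ 0.83 > 0.55 and the
# function returns 'black'; all three BAR_COLORS above are light enough that
# their segment labels end up black.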
def generate(processed_df: pd.DataFrame):
"""
Generates a stacked bar chart breaking down tasks by remote status and estimability.
This corresponds to 'cell10' from the original analysis notebook. It shows,
for each occupation, the percentage of tasks that are not remote, remote and
estimable, or remote and not estimable.
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'onetsoc_major', 'remote_status', 'estimateable'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating task breakdown by occupation plot...")
# --- Data Validation ---
required_cols = ['onetsoc_major', 'remote_status', 'estimateable']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# --- Data Summarization ---
summary_data = []
for code, label in OCCUPATION_MAJOR_CODES.items():
occ_df = df[df['onetsoc_major'] == code]
total_tasks = len(occ_df)
if total_tasks == 0:
continue
not_remote_count = len(occ_df[occ_df['remote_status'] != 'remote'])
remote_df = occ_df[occ_df['remote_status'] == 'remote']
remote_atomic_count = len(remote_df[remote_df['estimateable'] == 'ATOMIC'])
remote_ongoing_count = len(remote_df[remote_df['estimateable'] == 'ONGOING-CONSTRAINT'])
summary_data.append({
'occupation_label': label,
'count_not_remote': not_remote_count,
'count_remote_atomic': remote_atomic_count,
'count_remote_ongoing': remote_ongoing_count,
'total_tasks': total_tasks
})
if not summary_data:
logging.warning("No data available to generate the task breakdown plot.")
return None
summary_df = pd.DataFrame(summary_data)
# --- Percentage Calculation ---
summary_df['pct_not_remote'] = (summary_df['count_not_remote'] / summary_df['total_tasks']) * 100
summary_df['pct_remote_atomic'] = (summary_df['count_remote_atomic'] / summary_df['total_tasks']) * 100
summary_df['pct_remote_ongoing'] = (summary_df['count_remote_ongoing'] / summary_df['total_tasks']) * 100
plot_df = summary_df.set_index('occupation_label')[
['pct_not_remote', 'pct_remote_atomic', 'pct_remote_ongoing']
]
plot_df.columns = ['Not Remote', 'Remote & Estimable', 'Remote & Not Estimable']
plot_df = plot_df.sort_values(by='Not Remote', ascending=False)
# --- Plotting ---
try:
fig, ax = plt.subplots(figsize=(14, 10))
plot_df.plot(kind='barh', stacked=True, ax=ax, color=BAR_COLORS, width=0.8)
ax.set_xlabel("Percentage of Tasks", fontsize=12)
ax.set_ylabel("Occupation Major Group", fontsize=12)
ax.set_title("Task Breakdown by Occupation, Remote Status, and Estimability", fontsize=16, pad=20)
ax.xaxis.set_major_formatter(mtick.PercentFormatter())
ax.set_xlim(0, 100)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# Add percentage labels inside each bar segment
for i, container in enumerate(ax.containers):
text_color = _get_contrasting_text_color(BAR_COLORS[i])
for patch in container.patches:
width = patch.get_width()
if width > 3: # Only label segments wider than 3%
x = patch.get_x() + width / 2
y = patch.get_y() + patch.get_height() / 2
ax.text(x, y, f"{width:.1f}%", ha='center', va='center',
fontsize=8, color=text_color, fontweight='medium')
ax.legend(title="Task Category", bbox_to_anchor=(1.02, 1), loc='upper left', frameon=False)
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "task_breakdown_by_occupation.png"
plt.savefig(temp_path, dpi=300, bbox_inches='tight')
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
return None
finally:
plt.close()


@@ -0,0 +1,74 @@
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import tempfile
import logging
import pandas as pd
def generate(processed_df: pd.DataFrame):
"""
Generates a histogram of the task time estimate midpoints.
This generator corresponds to 'cell1' from the original analysis notebook.
It visualizes the distribution of the calculated midpoint of time estimates
for all tasks on a logarithmic scale to handle the wide range of values.
Args:
processed_df (pd.DataFrame): The preprocessed data, expected to contain
'lb_estimate_in_minutes' and
'ub_estimate_in_minutes' columns.
Returns:
Path: The path to the generated temporary image file, or None if
generation fails.
"""
logging.info("Generating task estimate distribution plot...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
if not all(col in processed_df.columns for col in required_cols):
logging.error(
f"Required columns {required_cols} not found in the DataFrame. "
"Cannot generate plot."
)
return None
# Create a copy to avoid modifying the original DataFrame
df = processed_df.copy()
# Calculate the midpoint from lower and upper bounds, as was done in the notebook
df['estimate_midpoint'] = (df['lb_estimate_in_minutes'] + df['ub_estimate_in_minutes']) / 2
# For log scaling, we must use positive values. Filter out any non-positive midpoints.
df = df[df['estimate_midpoint'] > 0]
if df.empty:
logging.warning("No data with positive estimate midpoints available to plot.")
return None
# --- Plotting ---
try:
plt.figure(figsize=(10, 6))
ax = sns.histplot(data=df, x='estimate_midpoint', log_scale=True)
ax.set_title('Distribution of Task Time Estimate Midpoints', fontsize=16)
ax.set_xlabel('Estimate Midpoint (minutes, log scale)', fontsize=12)
ax.set_ylabel('Number of Tasks', fontsize=12)
plt.tight_layout()
# --- File Saving ---
# Create a temporary file to save the plot. The orchestrator (`generate.py`)
# will move this to the final 'dist/' directory.
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "task_estimate_distribution.png"
plt.savefig(temp_path, dpi=300)
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
return None
finally:
# Close the figure to free up memory, which is crucial when running many generators.
plt.close()


@@ -0,0 +1,134 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from pathlib import Path
import tempfile
import logging
# Replicating the color palette from the original notebook for consistency.
# These appear to be inspired by Tailwind CSS colors.
GRAY_PALETTE = {
'100': '#F3F4F6',
'300': '#D1D5DB',
}
LIME_PALETTE = {
'300': '#D9F99D',
'600': '#A3E635', # A mid-tone lime
'900': '#4D7C0F', # A dark lime/green
}
def _calculate_cdf(series: pd.Series):
"""
Calculates the empirical Cumulative Distribution Function (CDF) for a series.
Returns the sorted values and their corresponding cumulative percentages.
"""
# Drop NA values and ensure the series is sorted
s = series.dropna().sort_values().reset_index(drop=True)
# Calculate cumulative percentage: (index + 1) / total_count
cdf_y = ((s.index + 1) / len(s)) * 100
return s.values, cdf_y
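# Worked example (illustrative): for the series [5, 10, 10, 60] this returns
# values [5, 10, 10, 60] with cumulative percentages [25, 50, 75, 100], i.e.
# 50% of the tasks have an estimate of 10 minutes or less.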
def generate(processed_df: pd.DataFrame):
"""
Generates a Cumulative Distribution Function (CDF) plot for task time estimates.
This corresponds to the second 'cell11' from the original notebook. It plots
the CDF for the lower-bound, upper-bound, and mid-point of time estimates,
showing the percentage of tasks that can be completed within a certain time.
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'lb_estimate_in_minutes',
'ub_estimate_in_minutes'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating temporal coherence CDF plot...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# Log scale requires positive values.
df = df[(df['lb_estimate_in_minutes'] > 0) & (df['ub_estimate_in_minutes'] > 0)]
if df.empty:
logging.warning("No data with positive estimates available to generate CDF plot.")
return None
# Calculate mid-point estimate
df['midpoint_estimate'] = (df['lb_estimate_in_minutes'] + df['ub_estimate_in_minutes']) / 2
# Prepare data for CDF plots
x_lb, y_lb = _calculate_cdf(df['lb_estimate_in_minutes'])
x_ub, y_ub = _calculate_cdf(df['ub_estimate_in_minutes'])
x_mid, y_mid = _calculate_cdf(df['midpoint_estimate'])
# --- Plotting ---
try:
fig, ax = plt.subplots(figsize=(12, 8))
# --- Grid and Reference Lines ---
# Horizontal reference lines for percentages
for y_val in range(0, 101, 10):
ax.axhline(y_val, color=GRAY_PALETTE['100'], linewidth=0.8, zorder=1)
# Vertical reference lines for human-friendly durations
ticks = [1, 5, 10, 30, 60, 120, 240, 480, 1440, 2880, 10080, 43200]
for tick in ticks:
ax.axvline(tick, color=GRAY_PALETTE['300'], linewidth=0.8, linestyle='--', zorder=1)
# --- CDF Plots ---
ax.step(x_lb, y_lb, where='post', color=LIME_PALETTE['300'], linewidth=1.8, linestyle='--', zorder=2, label='Lower-bound Estimate (CDF)')
ax.step(x_ub, y_ub, where='post', color=LIME_PALETTE['900'], linewidth=1.8, linestyle=':', zorder=3, label='Upper-bound Estimate (CDF)')
ax.step(x_mid, y_mid, where='post', color=LIME_PALETTE['600'], linewidth=2.2, zorder=4, label='Mid-point Estimate (CDF)')
# --- Axes Configuration ---
ax.set_ylim(0, 100)
ax.set_xscale('log')
# Custom x-ticks for durations
ticklabels = ['1 min', '5 min', '10 min', '30 min', '1 hr', '2 hrs', '4 hrs', '8 hrs', '1 day', '2 days', '1 week', '30 days']
ax.set_xticks(ticks)
ax.set_xticklabels(ticklabels, rotation=45, ha='right')
ax.minorticks_off() # Turn off minor ticks for clarity with custom grid
# Format y-axis as percentages
ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(decimals=0))
# --- Spines and Labels ---
for spine in ['top', 'right']:
ax.spines[spine].set_visible(False)
for spine in ['left', 'bottom']:
ax.spines[spine].set_edgecolor(GRAY_PALETTE['300'])
# Use ax.text for more control over label placement than ax.set_ylabel/xlabel
ax.text(-0.07, 1.02, "% of tasks with duration ≤ X", transform=ax.transAxes,
fontsize=12, fontweight='semibold', va='bottom')
ax.text(0.5, -0.25, 'Task Duration (X)', transform=ax.transAxes,
fontsize=12, fontweight='semibold', ha='center')
ax.legend(frameon=False, loc='lower right')
fig.suptitle('Cumulative Distribution of Task Time Estimates', fontsize=16, y=0.96)
plt.tight_layout(rect=[0, 0, 1, 0.95]) # Adjust layout to make space for suptitle
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "temporal_coherence_cdf.png"
plt.savefig(temp_path, dpi=300, bbox_inches='tight')
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the CDF plot: {e}", exc_info=True)
return None
finally:
plt.close()


@@ -0,0 +1,112 @@
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import tempfile
import logging
import pandas as pd
# Based on O*NET SOC 2018 structure, this mapping helps translate
# the 2-digit major group codes into human-readable labels.
OCCUPATION_MAJOR_CODES = {
'11': 'Management',
'13': 'Business & Financial',
'15': 'Computer & Mathematical',
'17': 'Architecture & Engineering',
'19': 'Life, Physical, & Social Science',
'21': 'Community & Social Service',
'23': 'Legal',
'25': 'Education, Training, & Library',
'27': 'Arts, Design, & Media',
'29': 'Healthcare Practitioners',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation & Serving',
'37': 'Building & Grounds Maintenance',
'39': 'Personal Care & Service',
'41': 'Sales & Related',
'43': 'Office & Admin Support',
'45': 'Farming, Fishing, & Forestry',
'47': 'Construction & Extraction',
'49': 'Installation, Maintenance, & Repair',
'51': 'Production',
'53': 'Transportation & Material Moving',
'55': 'Military Specific',
}
def generate(processed_df: pd.DataFrame):
"""
Generates a box plot showing the spread of time-range estimates per occupation.
This corresponds to 'cell2' from the original analysis notebook. It visualizes
the distribution of the difference between upper and lower time estimates for
each major occupational group.
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'lb_estimate_in_minutes',
'ub_estimate_in_minutes', 'onetsoc_major'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating plot of time estimate spread by occupation...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'onetsoc_major']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# Calculate the estimate range.
df['estimate_range'] = df['ub_estimate_in_minutes'] - df['lb_estimate_in_minutes']
# For log scaling, we need positive values. Filter out any non-positive ranges.
df = df[df['estimate_range'] > 0]
if df.empty:
logging.warning("No data with a positive estimate range available to plot.")
return None
# Sort by the major code to ensure a consistent plot order
df = df.sort_values('onetsoc_major')
# --- Plotting ---
try:
plt.figure(figsize=(14, 10))
ax = sns.boxplot(
data=df,
x='onetsoc_major',
y='estimate_range',
showfliers=False # Outliers are excluded for a clearer view of the main distribution
)
plt.yscale('log') # The long tail of the data makes a log scale more readable
plt.xlabel('Occupation Major Group', fontsize=12)
plt.ylabel('Time Estimate Range (upper - lower, in minutes, log scale)', fontsize=12)
plt.title('Spread of Time-Range Estimates by Occupation', fontsize=16)
# Replace numeric x-tick labels (e.g., '11', '15') with meaningful text labels
ax.set_xticklabels(
[OCCUPATION_MAJOR_CODES.get(code.get_text(), code.get_text()) for code in ax.get_xticklabels()],
rotation=60,
ha='right' # Align rotated labels correctly
)
plt.tight_layout()
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "time_estimate_spread_by_occupation.png"
plt.savefig(temp_path, dpi=300, bbox_inches='tight')
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
return None
finally:
plt.close()


@@ -0,0 +1,150 @@
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd
from pathlib import Path
import tempfile
import logging
# Assuming data.py is in the same package and provides this function
from ..data import get_db_connection
# This mapping helps translate the O*NET 2-digit major group codes
# into human-readable labels for the plot's y-axis.
OCCUPATION_MAJOR_CODES = {
'11': 'Management',
'13': 'Business & Financial',
'15': 'Computer & Mathematical',
'17': 'Architecture & Engineering',
'19': 'Life, Physical, & Social Science',
'21': 'Community & Social Service',
'23': 'Legal',
'25': 'Education, Training, & Library',
'27': 'Arts, Design, & Media',
'29': 'Healthcare Practitioners',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation & Serving',
'37': 'Building & Grounds Maintenance',
'39': 'Personal Care & Service',
'41': 'Sales & Related',
'43': 'Office & Admin Support',
'45': 'Farming, Fishing, & Forestry',
'47': 'Construction & Extraction',
'49': 'Installation, Maintenance, & Repair',
'51': 'Production',
'53': 'Transportation & Material Moving',
'55': 'Military Specific',
}
def generate(processed_df: pd.DataFrame):
"""
Generates a bar plot of the total wage bill per major occupation group.
This corresponds to the first 'cell11' from the original analysis notebook.
It calculates the total wage bill (Total Employment * Annual Mean Wage) for
each occupation and aggregates it by major occupation group. This generator
loads its data directly from the O*NET database.
Args:
processed_df (pd.DataFrame): The preprocessed data (not used in this generator,
but required by the function signature).
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating plot of total wage bill by occupation...")
conn = None
try:
# --- Data Loading ---
# This generator needs specific data that is not in the main preprocessed_df.
# It loads occupational employment and wage data directly from the database.
conn = get_db_connection()
if conn is None:
raise ConnectionError("Could not get database connection.")
# This data is stored in a long format in the `occupation_level_metadata` table.
# We need to query this table and pivot it to get employment and wage columns.
query = "SELECT onetsoc_code, item, response FROM occupation_level_metadata WHERE item IN ('Employment', 'Annual Mean Wage')"
try:
df_meta = pd.read_sql_query(query, conn)
# Pivot the table to create 'Employment' and 'Annual Mean Wage' columns
df_oesm = df_meta.pivot(index='onetsoc_code', columns='item', values='response').reset_index()
logging.info("Pivoted occupation metadata. Columns are: %s", df_oesm.columns.tolist())
# Rename for consistency with the original notebook's code
df_oesm.rename(columns={
'onetsoc_code': 'OCC_CODE',
'Employment': 'TOT_EMP',
'Annual Mean Wage': 'A_MEAN'
}, inplace=True)
except (pd.io.sql.DatabaseError, KeyError) as e:
logging.error(f"Failed to query or pivot occupation metadata: {e}", exc_info=True)
return None
# --- Data Preparation ---
# Create a 'major group' code from the first two digits of the SOC code
df_oesm['onetsoc_major'] = df_oesm['OCC_CODE'].str[:2]
# Ensure wage and employment columns are numeric, coercing errors to NaN
df_oesm['TOT_EMP'] = pd.to_numeric(df_oesm['TOT_EMP'], errors='coerce')
df_oesm['A_MEAN'] = pd.to_numeric(df_oesm['A_MEAN'], errors='coerce')
# Drop rows with missing data in critical columns
df_oesm.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_major'], inplace=True)
# Calculate the wage bill for each occupation
df_oesm['wage_bill'] = df_oesm['TOT_EMP'] * df_oesm['A_MEAN']
# Aggregate the wage bill by major occupation group
df_wage_bill_major = df_oesm.groupby('onetsoc_major')['wage_bill'].sum().reset_index()
# Map the major codes to readable titles for plotting
df_wage_bill_major['OCC_TITLE_MAJOR'] = df_wage_bill_major['onetsoc_major'].map(OCCUPATION_MAJOR_CODES)
df_wage_bill_major.dropna(subset=['OCC_TITLE_MAJOR'], inplace=True) # Drop military/unmapped codes
# Sort by wage bill for a more informative plot
df_wage_bill_major = df_wage_bill_major.sort_values('wage_bill', ascending=False)
if df_wage_bill_major.empty:
logging.warning("No data available to generate the wage bill plot.")
return None
# --- Plotting ---
plt.figure(figsize=(12, 10))
ax = sns.barplot(x='wage_bill', y='OCC_TITLE_MAJOR', data=df_wage_bill_major, palette="viridis", orient='h')
ax.set_title('Total Wage Bill per Major Occupation Group', fontsize=16, pad=15)
ax.set_xlabel('Total Wage Bill (in USD)', fontsize=12)
ax.set_ylabel('Major Occupation Group', fontsize=12)
ax.grid(axis='x', linestyle='--', alpha=0.7)
# Format the x-axis to be more readable (e.g., "$2.0T" for trillions)
def format_billions(x, pos):
if x >= 1e12:
return f'${x*1e-12:.1f}T'
if x >= 1e9:
return f'${x*1e-9:.0f}B'
return f'${x*1e-6:.0f}M'
ax.xaxis.set_major_formatter(mticker.FuncFormatter(format_billions))
plt.tight_layout()
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "wage_bill_by_occupation.png"
plt.savefig(temp_path, dpi=300)
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the wage bill plot: {e}", exc_info=True)
return None
finally:
plt.close()
if conn:
conn.close()

64
analysis/main.py Normal file

@@ -0,0 +1,64 @@
import logging
import sys
# Since this file is inside the 'analysis' package, we use relative imports
# to access the other modules within the same package.
from . import data
from . import preprocess
from . import generate
# Configure logging for the entire application.
# This setup will apply to loggers in data, preprocess, and generate modules as well.
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
stream=sys.stdout
)
def main():
"""
The main entry point for the entire analysis pipeline.
This function orchestrates the three main stages of the analysis:
1. Data Setup: Downloads and prepares the necessary raw data and database.
2. Preprocessing: Cleans, enriches, and transforms the raw data into an
analysis-ready DataFrame.
3. Output Generation: Runs all registered generators to produce figures,
tables, and other outputs, saving them to the 'dist/' directory.
"""
logger = logging.getLogger(__name__)
logger.info("=================================================")
logger.info(" STARTING ECONTAI ANALYSIS PIPELINE ")
logger.info("=================================================")
try:
# Stage 1: Set up the data and database
logger.info("--- STAGE 1: DATA SETUP ---")
data.setup_data_and_database()
logger.info("--- DATA SETUP COMPLETE ---")
# Stage 2: Run the preprocessing pipeline
logger.info("--- STAGE 2: PREPROCESSING ---")
processed_dataframe = preprocess.run_preprocessing()
logger.info("--- PREPROCESSING COMPLETE ---")
# Stage 3: Generate all outputs
logger.info("--- STAGE 3: OUTPUT GENERATION ---")
generate.create_all_outputs(processed_dataframe)
logger.info("--- OUTPUT GENERATION COMPLETE ---")
logger.info("=================================================")
logger.info(" ANALYSIS PIPELINE COMPLETED SUCCESSFULLY ")
logger.info("=================================================")
except Exception as e:
logger.critical("An unrecoverable error occurred during the pipeline execution.", exc_info=True)
# Exit with a non-zero status code to indicate failure, which is useful for automation.
sys.exit(1)
# This allows the script to be run from the command line using `python -m analysis.main`.
# The `-m` flag is important because it adds the parent directory to the Python path,
# allowing the relative imports (e.g., `from . import data`) to work correctly.
if __name__ == '__main__':
main()

160
analysis/preprocess.py Normal file

@@ -0,0 +1,160 @@
import logging
import pandas as pd
import numpy as np
from scipy.stats import median_abs_deviation
from .data import get_db_connection
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def _convert_to_minutes(level: float) -> float:
"""
Converts O*NET 'Frequency' scale values (levels) to estimated minutes per day.
This logic is derived from the `preprocessing_time_estimates` function
in the original analysis notebook.
"""
if pd.isna(level):
return 0
# This mapping is an interpretation of the O*NET frequency scale.
return {
1: 0, # Yearly or less
2: 2, # Several times a year
3: 10, # Several times a month
4: 30, # Several times a week
5: 120, # Daily
6: 240, # Several times a day
7: 480, # Hourly or more
}.get(int(level), 0)
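# Worked example (illustrative): a frequency rating with lower_ci_bound=3 and
# upper_ci_bound=5 maps to 10 and 120 minutes per day respectively, giving an
# estimate_midpoint of (10 + 120) / 2 = 65 minutes (see step 3 below).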
def _mad_z_score(series: pd.Series) -> pd.Series:
"""
Calculates the robust Z-score using Median Absolute Deviation (MAD).
This function is derived from 'cell7' of the original analysis.
"""
if series.isnull().all():
return pd.Series([np.nan] * len(series), index=series.index)
median = series.median()
# scale='normal' makes MAD comparable to the standard deviation for a normal distribution.
mad = median_abs_deviation(series.dropna(), scale='normal')
if mad == 0:
return pd.Series([np.nan] * len(series), index=series.index)
return (series - median) / mad
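# Worked example (illustrative): for midpoints [10, 12, 14, 16, 200] the median
# is 14 and the normal-scaled MAD is about 2 * 1.4826 ≈ 2.97, so the outlier 200
# gets a robust z-score of roughly (200 - 14) / 2.97 ≈ 63, whereas its ordinary
# z-score is only about 1.8 because the outlier inflates the standard deviation.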
def run_preprocessing() -> pd.DataFrame:
"""
Main orchestrator for the preprocessing pipeline.
This function faithfully reproduces the data transformation pipeline from the
original `analysis.py` script, including the `preprocessing_time_estimates`
and cell-specific data manipulations.
Returns:
pd.DataFrame: A fully preprocessed DataFrame ready for the generators.
"""
logging.info("Starting data preprocessing...")
conn = None
try:
conn = get_db_connection()
if conn is None:
raise ConnectionError("Could not establish database connection.")
# --- 1. Load Data from Database ---
# Fetch all necessary tables to build the initial DataFrame.
logging.info("Loading data from O*NET database...")
task_ratings_df = pd.read_sql_query("SELECT * FROM task_ratings", conn)
task_statements_df = pd.read_sql_query("SELECT * FROM task_statements", conn)
occupations_df = pd.read_sql_query("SELECT * FROM occupation_data", conn)
# --- 2. Initial Merge ---
# Merge the tables to create a comprehensive base DataFrame.
# Merging on both 'onetsoc_code' and 'task_id' is crucial to avoid
# creating duplicate columns from the overlapping 'onetsoc_code'.
logging.info("Merging base tables...")
tasks_df = pd.merge(task_ratings_df, task_statements_df, on=['onetsoc_code', 'task_id'])
tasks_df = pd.merge(tasks_df, occupations_df, on='onetsoc_code')
# --- 3. Create "Atomic Tasks" and Time Estimates (from `preprocessing_time_estimates`) ---
# This is the core of the analysis, focusing on tasks with frequency ratings.
logging.info("Filtering for 'atomic tasks' (scale_id='FR') and calculating time estimates...")
# Strip whitespace from scale_id to ensure the filter works correctly.
tasks_df['scale_id'] = tasks_df['scale_id'].str.strip()
atomic_tasks = tasks_df[tasks_df['scale_id'] == 'FR'].copy()
# Convert frequency confidence intervals into minutes/day
atomic_tasks['lb_estimate_in_minutes'] = atomic_tasks['lower_ci_bound'].apply(_convert_to_minutes)
atomic_tasks['ub_estimate_in_minutes'] = atomic_tasks['upper_ci_bound'].apply(_convert_to_minutes)
atomic_tasks['estimate_midpoint'] = (atomic_tasks['lb_estimate_in_minutes'] + atomic_tasks['ub_estimate_in_minutes']) / 2
# --- 4. Add Derived Columns for Analysis (from `cell` logic) ---
logging.info("Adding derived columns for analysis...")
# Add `onetsoc_major` for grouping by occupation category
atomic_tasks['onetsoc_major'] = atomic_tasks['onetsoc_code'].str[:2]
# Calculate estimate_range and estimate_ratio used in several plots
atomic_tasks['estimate_range'] = atomic_tasks['ub_estimate_in_minutes'] - atomic_tasks['lb_estimate_in_minutes']
# To calculate ratio, ensure lower bound is positive to avoid division by zero
lb_positive = atomic_tasks['lb_estimate_in_minutes'] > 0
atomic_tasks['estimate_ratio'] = np.nan
atomic_tasks.loc[lb_positive, 'estimate_ratio'] = atomic_tasks['ub_estimate_in_minutes'] / atomic_tasks['lb_estimate_in_minutes']
# --- 5. Calculate Outlier Scores (from `cell6` and `cell7`) ---
logging.info("Calculating standard and robust Z-scores for outlier detection...")
# Standard Z-score
grouped_stats = atomic_tasks.groupby('onetsoc_code')['estimate_midpoint'].agg(['mean', 'std'])
atomic_tasks = atomic_tasks.merge(grouped_stats, on='onetsoc_code', how='left')
# Calculate Z-score, avoiding division by zero if std is 0
non_zero_std = atomic_tasks['std'].notna() & (atomic_tasks['std'] != 0)
atomic_tasks['z_score'] = np.nan
atomic_tasks.loc[non_zero_std, 'z_score'] = \
(atomic_tasks.loc[non_zero_std, 'estimate_midpoint'] - atomic_tasks.loc[non_zero_std, 'mean']) / atomic_tasks.loc[non_zero_std, 'std']
# Robust Z-score (using MAD)
atomic_tasks['robust_z_score'] = atomic_tasks.groupby('onetsoc_code')['estimate_midpoint'].transform(_mad_z_score)
# --- 6. Prepare for other generators ---
# NOTE: The data for the 'task_breakdown_by_occupation' generator, specifically
# the 'remote_status' and 'estimateable' columns, is not available in the O*NET
# database. This data was likely loaded from a separate file (e.g., 'tasks_clean.parquet')
# in the original notebook. For now, we will add placeholder columns.
atomic_tasks['remote_status'] = 'unknown'
atomic_tasks['estimateable'] = 'unknown'
logging.info("Data preprocessing complete.")
return atomic_tasks
except Exception as e:
logging.error("An error occurred during preprocessing: %s", e, exc_info=True)
# Return an empty DataFrame on failure to prevent downstream errors
return pd.DataFrame()
finally:
if conn:
conn.close()
logging.info("Database connection closed.")
if __name__ == '__main__':
# This allows the preprocessing to be run directly for testing or debugging,
# e.g. `python -m analysis.preprocess` (the relative import requires the -m form).
# Note: Requires data to be set up first by running data.py.
try:
processed_data = run_preprocessing()
if not processed_data.empty:
print("Preprocessing successful. DataFrame shape:", processed_data.shape)
print("Columns:", processed_data.columns.tolist())
print(processed_data.head())
# Save to a temporary file to inspect the output
output_path = "temp_preprocessed_data.csv"
processed_data.to_csv(output_path, index=False)
print(f"Sample output saved to {output_path}")
else:
print("Preprocessing failed or resulted in an empty DataFrame.")
except (FileNotFoundError, ConnectionError) as e:
logging.error("Failed to run preprocessing: %s", e)