Félix Dorn 2025-07-03 17:32:41 +02:00
parent 2da206d368
commit b7c94590f9
14 changed files with 2200 additions and 13 deletions

View file

@@ -7,6 +7,7 @@ from .run import Run
 import pandas as pd
 
 def enrich_with_task_estimateability(run: Run) -> pd.DataFrame:
+    run.metadata.
     raise NotImplementedError
 
 def enrich_with_task_estimates(run: Run) -> pd.DataFrame:

View file

@@ -5,13 +5,148 @@ Fetchers retrieve remote data and return it in a format suitable for further processing.
 import sqlite3
 from typing import Tuple
 import pandas as pd
-from .metadata import Metadata
+import requests
+import hashlib
+import io
+import zipfile
+from .run import Run
+from .logger import logger
 
-def fetch_onet_database(meta: Metadata) -> Tuple[sqlite3.Connection, str]:
-    raise NotImplementedError
+def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
+    """
+    Downloads the O*NET database, creates a local SQLite file from it, and returns a connection.
+    The version is the sha256 of the downloaded zip file.
+    """
+    url = "https://www.onetcenter.org/dl_files/database/db_29_1_mysql.zip"
+    logger.info(f"Downloading O*NET database from {url}")
+    response = requests.get(url, stream=True)
+    response.raise_for_status()
 
-def fetch_oesm_data(meta: Metadata) -> Tuple[pd.DataFrame, str]:
-    raise NotImplementedError
+    # Read content into memory
+    zip_content = response.content
+    version = hashlib.sha256(zip_content).hexdigest()
+    logger.info(f"O*NET database version (sha256): {version}")
 
-def fetch_epoch_remote_data(meta: Metadata) -> Tuple[pd.DataFrame, str]:
-    raise NotImplementedError
+    db_path = run.cache_dir / f"onet_{version}.db"
+    if db_path.exists():
+        logger.info(f"Using cached O*NET database: {db_path}")
+        conn = sqlite3.connect(db_path)
+        # Set PRAGMA for foreign keys on every connection
+        conn.execute("PRAGMA foreign_keys = ON;")
+        return conn, version
+
+    logger.info(f"Creating new O*NET database: {db_path}")
+    conn = sqlite3.connect(db_path)
+
+    # Set performance PRAGMAs for fast import
+    logger.info("Creating new SQLite database with performance settings")
+    conn.executescript("""
+        PRAGMA journal_mode = OFF;
+        PRAGMA synchronous = 0;
+        PRAGMA cache_size = 1000000;
+        PRAGMA locking_mode = EXCLUSIVE;
+        PRAGMA temp_store = MEMORY;
+        PRAGMA foreign_keys = ON;
+    """)
+
+    with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
+        sql_scripts = []
+        for filename in sorted(z.namelist()):
+            if filename.endswith(".sql"):
+                sql_scripts.append(z.read(filename).decode('utf-8'))
+
+        if not sql_scripts:
+            raise RuntimeError("No SQL files found in the O*NET zip archive.")
+
+        # Combine and execute all SQL files in one transaction
+        full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;"
+        logger.info("Executing SQL files in alphabetical order (single transaction mode)")
+        conn.executescript(full_script)
+
+    logger.info("Database populated successfully. Restoring reliability settings...")
+    # Restore reliability-focused settings after import
+    conn.executescript("""
+        PRAGMA journal_mode = WAL;
+        PRAGMA synchronous = NORMAL;
+        PRAGMA locking_mode = NORMAL;
+        PRAGMA temp_store = DEFAULT;
+        PRAGMA foreign_keys = ON;
+        PRAGMA optimize;
+    """)
+    conn.execute("VACUUM;")
+    conn.commit()
+    logger.info("Reliability settings restored and database optimized successfully!")
+
+    return conn, version
+
+def fetch_oesm_data(run: Run) -> Tuple[pd.DataFrame, str]:
+    """
+    Downloads the OESM national data from the BLS website.
+    The version is the sha256 of the downloaded zip file.
+    """
+    url = "https://www.bls.gov/oes/special-requests/oesm23nat.zip"
+    logger.info(f"Downloading OESM data from {url}")
+    response = requests.get(url)
+    response.raise_for_status()
+
+    zip_content = response.content
+    version = hashlib.sha256(zip_content).hexdigest()
+    logger.info(f"OESM data version (sha256): {version}")
+
+    parquet_path = run.cache_dir / f"oesm_{version}.parquet"
+    if parquet_path.exists():
+        logger.info(f"Using cached OESM data: {parquet_path}")
+        return pd.read_parquet(parquet_path), version
+
+    logger.info(f"Creating new OESM data cache: {parquet_path}")
+    with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
+        # Find the Excel file in the zip
+        excel_filename = None
+        for filename in z.namelist():
+            logger.debug(f"Found file in OESM zip: {filename}")
+            if filename.lower().endswith(".xlsx"):
+                excel_filename = filename
+                break
+
+        if excel_filename is None:
+            raise FileNotFoundError("Could not find the Excel file in the OESM zip archive.")
+
+        logger.info(f"Reading {excel_filename} from zip archive.")
+        with z.open(excel_filename) as f:
+            df = pd.read_excel(f, engine='openpyxl')
+
+    df.to_parquet(parquet_path)
+    logger.info(f"Saved OESM data to cache: {parquet_path}")
+    return df, version
+
+def fetch_epoch_remote_data(run: Run) -> Tuple[pd.DataFrame, str]:
+    """
+    Downloads the EPOCH AI remote work task data.
+    The version is the sha256 of the downloaded CSV file.
+    """
+    # This is the direct download link constructed from the Google Drive share link
+    url = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
+    logger.info(f"Downloading EPOCH remote data from Google Drive: {url}")
+
+    # Need to handle potential cookies/redirects from Google Drive
+    session = requests.Session()
+    response = session.get(url, stream=True)
+    response.raise_for_status()
+
+    csv_content = response.content
+    version = hashlib.sha256(csv_content).hexdigest()
+    logger.info(f"EPOCH remote data version (sha256): {version}")
+
+    parquet_path = run.cache_dir / f"epoch_remote_{version}.parquet"
+    if parquet_path.exists():
+        logger.info(f"Using cached EPOCH remote data: {parquet_path}")
+        return pd.read_parquet(parquet_path), version
+
+    logger.info(f"Creating new EPOCH remote data cache: {parquet_path}")
+    df = pd.read_csv(io.BytesIO(csv_content))
+    df.to_parquet(parquet_path)
+    logger.info(f"Saved EPOCH remote data to cache: {parquet_path}")
+    return df, version
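All three fetchers share one content-addressed caching idea: download the raw bytes, use their sha256 both as the dataset version and as the cache key, and rebuild the derived artifact (SQLite file, parquet) only when no file for that hash exists. A minimal sketch of the shared pattern, not part of the commit (fetch_versioned, its parameters, and the write-through behaviour are illustrative):

# Illustrative only: the commit implements this pattern inline per dataset.
import hashlib
from pathlib import Path
from typing import Tuple

import requests

def fetch_versioned(url: str, cache_dir: Path, prefix: str, suffix: str) -> Tuple[Path, str]:
    # The download always happens; the hash of the payload is the version.
    response = requests.get(url)
    response.raise_for_status()
    content = response.content
    version = hashlib.sha256(content).hexdigest()

    # The cache key embeds the version, so a new upstream release never
    # collides with an old cache file.
    path = cache_dir / f"{prefix}_{version}{suffix}"
    if not path.exists():
        path.write_bytes(content)
    return path, version

Note the trade-off this accepts: the payload is re-downloaded on every run, since the hash cannot be known beforehand; the cache only avoids re-parsing and re-building the derived artifact.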

View file

@@ -2,5 +2,5 @@ from ..run import Run
 from pathlib import Path
 from typing import Generator
 
-def generate_estimate_histplot(run: Run, output_dir: Path) -> Generator[Path]:
+def generate_estimate_histplot(run: Run) -> Generator[Path]:
     raise NotImplementedError
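The signature change drops output_dir because the Run now carries it (see run.py below). A hedged sketch of a generator conforming to the new signature, not part of the commit (the function and file names are assumptions):

# Hypothetical generator under the new signature: it reads the output
# directory from the Run instead of taking it as a parameter.
from pathlib import Path
from typing import Generator

from ..run import Run

def generate_example_plot(run: Run) -> Generator[Path, None, None]:
    out = run.output_dir / "example.png"  # "example.png" is an assumed name
    # ... render and save a figure to `out` here ...
    yield out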

pipeline/logger.py Normal file (+24 lines)
View file

@@ -0,0 +1,24 @@
+import logging
+from logging.handlers import RotatingFileHandler
+
+from rich.logging import RichHandler
+
+LOGGER_NAME = "pipeline"
+
+
+def setup_logging() -> logging.Logger:
+    # Set up Rich console handler
+    rich_handler = RichHandler(
+        level=logging.DEBUG,
+        show_time=True,
+        enable_link_path=True,
+        rich_tracebacks=True,
+        # omit_repeated_times=False,
+    )
+
+    logger = logging.getLogger(LOGGER_NAME)
+    logger.setLevel(logging.DEBUG)
+    logger.addHandler(rich_handler)
+    return logger
+
+logger = setup_logging()
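RotatingFileHandler is imported above but never attached, so logs currently go to the console only. A hedged sketch of wiring it in, not part of the commit (the function name, log path, and size limits are assumptions):

# Hypothetical extension: mirror console logs into a size-capped file.
import logging
from logging.handlers import RotatingFileHandler

def add_file_logging(logger: logging.Logger, path: str = "pipeline.log") -> None:
    handler = RotatingFileHandler(path, maxBytes=1_000_000, backupCount=3)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s"))
    logger.addHandler(handler)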

View file

@@ -16,6 +16,7 @@ class Metadata(BaseModel):
     versions, and other important information.
     """
 
     fetchers: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
+    enrichments: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
     ts: str = Field(default_factory=lambda: datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
     commit: str = Field(default_factory=lambda: _get_current_commit())
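The new enrichments field mirrors fetchers: both are free-form dicts keyed by stage name, giving each pipeline stage a place to record provenance. A hedged usage sketch, not part of the commit (the keys and payloads are illustrative; pydantic v2 is assumed for model_dump_json):

from pipeline.metadata import Metadata  # import path assumed

meta = Metadata()
# Illustrative keys and payloads, not prescribed by the model.
meta.fetchers["onet"] = {"version": "<sha256 of the downloaded zip>"}
meta.enrichments["task_estimateability"] = {"version": "v1"}
print(meta.model_dump_json(indent=2))  # pydantic v2 API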

View file

@@ -1,6 +1,7 @@
 from pydantic import BaseModel, Field
 import sqlite3
 import pandas as pd
+from pathlib import Path
 from typing import Optional
 from .metadata import Metadata
 
@@ -20,3 +21,6 @@ class Run(BaseModel):
     task_estimates_df: Optional[pd.DataFrame] = None
 
     meta: Metadata = Field(default_factory=Metadata)
+
+    cache_dir: Path
+    output_dir: Path

View file

@@ -5,11 +5,14 @@ from .postprocessors import check_for_insanity, create_df_tasks
 from .generators import GENERATORS
 from .run import Run
 from .constants import GRAY
+import platformdirs
 import seaborn as sns
 import matplotlib as mpl
 from pathlib import Path
 from typing import Optional
 
+CACHE_DIR = platformdirs.user_cache_dir("econtai")
+
 def run(output_dir: Optional[str] = None):
     if output_dir is None:
         output_dir = Path(".")
@@ -17,12 +20,12 @@ def run(output_dir: Optional[str] = None):
     load_dotenv()
     _setup_graph_rendering()
 
-    current_run = Run()
+    current_run = Run(output_dir=output_dir, cache_dir=CACHE_DIR)
 
     # Fetchers (fetchers.py)
-    current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run.meta)
-    current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run.meta)
-    current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run.meta)
+    current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run)
+    current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run)
+    current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run)
 
     # Enrichments (enrichments.py)
     current_run.task_estimateability_df = enrich_with_task_estimateability(current_run)
@@ -34,7 +37,7 @@ def run(output_dir: Optional[str] = None):
 
     # Generators (generators/)
     for gen in GENERATORS:
-        gen(current_run, output_dir)
+        gen(current_run)
 
 
 def _setup_graph_rendering():
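Taken together, run() now threads a single Run, carrying cache_dir and output_dir, through fetchers, enrichments, and generators. A hedged invocation sketch, not part of the commit (the module path for run() is an assumption):

# Hypothetical entry point: run the pipeline into ./out, with downloads
# cached under the platformdirs user cache configured above.
from pathlib import Path

from pipeline import run  # assumed import path for the module defining run()

if __name__ == "__main__":
    run(output_dir=Path("out"))  # pydantic coerces this into Run.output_dir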