Félix Dorn 2025-07-03 17:32:41 +02:00
parent 2da206d368
commit b7c94590f9
14 changed files with 2200 additions and 13 deletions

View file

@@ -7,6 +7,7 @@ from .run import Run
 import pandas as pd
 
 def enrich_with_task_estimateability(run: Run) -> pd.DataFrame:
+    run.metadata.
     raise NotImplementedError
 
 def enrich_with_task_estimates(run: Run) -> pd.DataFrame:

View file

@@ -5,13 +5,148 @@ Fetchers retrieve remote data and return it in a format suitable for further processing.
 import sqlite3
 from typing import Tuple
 import pandas as pd
-from .metadata import Metadata
+import requests
+import hashlib
+import io
+import zipfile
+from .run import Run
+from .logger import logger
 
-def fetch_onet_database(meta: Metadata) -> Tuple[sqlite3.Connection, str]:
-    raise NotImplementedError
+def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
+    """
+    Downloads the O*NET database, creates a local SQLite file from it, and returns a connection.
+    The version is the sha256 of the downloaded zip file.
+    """
+    url = "https://www.onetcenter.org/dl_files/database/db_29_1_mysql.zip"
+    logger.info(f"Downloading O*NET database from {url}")
+    response = requests.get(url, stream=True)
+    response.raise_for_status()
 
-def fetch_oesm_data(meta: Metadata) -> Tuple[pd.DataFrame, str]:
-    raise NotImplementedError
+    # Read content into memory
+    zip_content = response.content
+    version = hashlib.sha256(zip_content).hexdigest()
+    logger.info(f"O*NET database version (sha256): {version}")
 
-def fetch_epoch_remote_data(meta: Metadata) -> Tuple[pd.DataFrame, str]:
-    raise NotImplementedError
+    db_path = run.cache_dir / f"onet_{version}.db"
+    if db_path.exists():
+        logger.info(f"Using cached O*NET database: {db_path}")
+        conn = sqlite3.connect(db_path)
+        # Set PRAGMA for foreign keys on every connection
+        conn.execute("PRAGMA foreign_keys = ON;")
+        return conn, version
+
+    logger.info(f"Creating new O*NET database: {db_path}")
+    conn = sqlite3.connect(db_path)
+
+    # Set performance PRAGMAs for fast import
+    logger.info("Creating new SQLite database with performance settings")
+    conn.executescript("""
+        PRAGMA journal_mode = OFF;
+        PRAGMA synchronous = 0;
+        PRAGMA cache_size = 1000000;
+        PRAGMA locking_mode = EXCLUSIVE;
+        PRAGMA temp_store = MEMORY;
+        PRAGMA foreign_keys = ON;
+    """)
+
+    with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
+        sql_scripts = []
+        for filename in sorted(z.namelist()):
+            if filename.endswith(".sql"):
+                sql_scripts.append(z.read(filename).decode('utf-8'))
+
+        if not sql_scripts:
+            raise RuntimeError("No SQL files found in the O*NET zip archive.")
+
+        # Combine and execute all SQL files in one transaction
+        full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;"
+        logger.info("Executing SQL files in alphabetical order (single transaction mode)")
+        conn.executescript(full_script)
+
+    logger.info("Database populated successfully. Restoring reliability settings...")
+    # Restore reliability-focused settings after import
+    conn.executescript("""
+        PRAGMA journal_mode = WAL;
+        PRAGMA synchronous = NORMAL;
+        PRAGMA locking_mode = NORMAL;
+        PRAGMA temp_store = DEFAULT;
+        PRAGMA foreign_keys = ON;
+        PRAGMA optimize;
+    """)
+    conn.execute("VACUUM;")
+    conn.commit()
+    logger.info("Reliability settings restored and database optimized successfully!")
+
+    return conn, version
+
+def fetch_oesm_data(run: Run) -> Tuple[pd.DataFrame, str]:
+    """
+    Downloads the OESM national data from the BLS website.
+    The version is the sha256 of the downloaded zip file.
+    """
+    url = "https://www.bls.gov/oes/special-requests/oesm23nat.zip"
+    logger.info(f"Downloading OESM data from {url}")
+    response = requests.get(url)
+    response.raise_for_status()
+
+    zip_content = response.content
+    version = hashlib.sha256(zip_content).hexdigest()
+    logger.info(f"OESM data version (sha256): {version}")
+
+    parquet_path = run.cache_dir / f"oesm_{version}.parquet"
+    if parquet_path.exists():
+        logger.info(f"Using cached OESM data: {parquet_path}")
+        return pd.read_parquet(parquet_path), version
+
+    logger.info(f"Creating new OESM data cache: {parquet_path}")
+    with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
+        # Find the Excel file in the zip
+        excel_filename = None
+        for filename in z.namelist():
+            logger.debug(f"Found file in OESM zip: {filename}")
+            if filename.lower().endswith(".xlsx"):
+                excel_filename = filename
+                break
+
+        if excel_filename is None:
+            raise FileNotFoundError("Could not find the Excel file in the OESM zip archive.")
+
+        logger.info(f"Reading {excel_filename} from zip archive.")
+        with z.open(excel_filename) as f:
+            df = pd.read_excel(f, engine='openpyxl')
+
+    df.to_parquet(parquet_path)
+    logger.info(f"Saved OESM data to cache: {parquet_path}")
+    return df, version
+
+def fetch_epoch_remote_data(run: Run) -> Tuple[pd.DataFrame, str]:
+    """
+    Downloads the EPOCH AI remote work task data.
+    The version is the sha256 of the downloaded CSV file.
+    """
+    # This is the direct download link constructed from the Google Drive share link
+    url = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
+    logger.info(f"Downloading EPOCH remote data from Google Drive: {url}")
+
+    # Need to handle potential cookies/redirects from Google Drive
+    session = requests.Session()
+    response = session.get(url, stream=True)
+    response.raise_for_status()
+
+    csv_content = response.content
+    version = hashlib.sha256(csv_content).hexdigest()
+    logger.info(f"EPOCH remote data version (sha256): {version}")
+
+    parquet_path = run.cache_dir / f"epoch_remote_{version}.parquet"
+    if parquet_path.exists():
+        logger.info(f"Using cached EPOCH remote data: {parquet_path}")
+        return pd.read_parquet(parquet_path), version
+
+    logger.info(f"Creating new EPOCH remote data cache: {parquet_path}")
+    df = pd.read_csv(io.BytesIO(csv_content))
+    df.to_parquet(parquet_path)
+    logger.info(f"Saved EPOCH remote data to cache: {parquet_path}")
+    return df, version
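All three fetchers share one content-addressed caching idea: download the raw bytes, use their sha256 both as the dataset version and as the cache key, and rebuild the derived artifact (SQLite file, parquet) only when no file for that hash exists. A minimal sketch of the shared pattern, not part of the commit (fetch_versioned, its parameters, and the write-through behaviour are illustrative):

# Illustrative only: the commit implements this pattern inline per dataset.
import hashlib
from pathlib import Path
from typing import Tuple

import requests

def fetch_versioned(url: str, cache_dir: Path, prefix: str, suffix: str) -> Tuple[Path, str]:
    # The download always happens; the hash of the payload is the version.
    response = requests.get(url)
    response.raise_for_status()
    content = response.content
    version = hashlib.sha256(content).hexdigest()

    # The cache key embeds the version, so a new upstream release never
    # collides with an old cache file.
    path = cache_dir / f"{prefix}_{version}{suffix}"
    if not path.exists():
        path.write_bytes(content)
    return path, version

Note the trade-off this accepts: the payload is re-downloaded on every run, since the hash cannot be known beforehand; the cache only avoids re-parsing and re-building the derived artifact.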

View file

@@ -2,5 +2,5 @@ from ..run import Run
 from pathlib import Path
 from typing import Generator
 
-def generate_estimate_histplot(run: Run, output_dir: Path) -> Generator[Path]:
+def generate_estimate_histplot(run: Run) -> Generator[Path]:
     raise NotImplementedError
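The signature change drops output_dir because the Run now carries it (see run.py below). A hedged sketch of a generator conforming to the new signature, not part of the commit (the function and file names are assumptions):

# Hypothetical generator under the new signature: it reads the output
# directory from the Run instead of taking it as a parameter.
from pathlib import Path
from typing import Generator

from ..run import Run

def generate_example_plot(run: Run) -> Generator[Path, None, None]:
    out = run.output_dir / "example.png"  # "example.png" is an assumed name
    # ... render and save a figure to `out` here ...
    yield out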

pipeline/logger.py Normal file (+24 lines)
View file

@@ -0,0 +1,24 @@
+import logging
+from logging.handlers import RotatingFileHandler
+
+from rich.logging import RichHandler
+
+LOGGER_NAME = "pipeline"
+
+
+def setup_logging() -> logging.Logger:
+    # Set up Rich console handler
+    rich_handler = RichHandler(
+        level=logging.DEBUG,
+        show_time=True,
+        enable_link_path=True,
+        rich_tracebacks=True,
+        # omit_repeated_times=False,
+    )
+
+    logger = logging.getLogger(LOGGER_NAME)
+    logger.setLevel(logging.DEBUG)
+    logger.addHandler(rich_handler)
+    return logger
+
+logger = setup_logging()
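RotatingFileHandler is imported above but never attached, so logs currently go to the console only. A hedged sketch of wiring it in, not part of the commit (the function name, log path, and size limits are assumptions):

# Hypothetical extension: mirror console logs into a size-capped file.
import logging
from logging.handlers import RotatingFileHandler

def add_file_logging(logger: logging.Logger, path: str = "pipeline.log") -> None:
    handler = RotatingFileHandler(path, maxBytes=1_000_000, backupCount=3)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s"))
    logger.addHandler(handler)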

View file

@@ -16,6 +16,7 @@ class Metadata(BaseModel):
     versions, and other important information.
     """
 
     fetchers: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
+    enrichments: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
     ts: str = Field(default_factory=lambda: datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
     commit: str = Field(default_factory=lambda: _get_current_commit())
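The new enrichments field mirrors fetchers: both are free-form dicts keyed by stage name, giving each pipeline stage a place to record provenance. A hedged usage sketch, not part of the commit (the keys and payloads are illustrative; pydantic v2 is assumed for model_dump_json):

from pipeline.metadata import Metadata  # import path assumed

meta = Metadata()
# Illustrative keys and payloads, not prescribed by the model.
meta.fetchers["onet"] = {"version": "<sha256 of the downloaded zip>"}
meta.enrichments["task_estimateability"] = {"version": "v1"}
print(meta.model_dump_json(indent=2))  # pydantic v2 API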

View file

@@ -1,6 +1,7 @@
 from pydantic import BaseModel, Field
 import sqlite3
 import pandas as pd
+from pathlib import Path
 from typing import Optional
 from .metadata import Metadata
 
@@ -20,3 +21,6 @@ class Run(BaseModel):
     task_estimates_df: Optional[pd.DataFrame] = None
 
     meta: Metadata = Field(default_factory=Metadata)
+
+    cache_dir: Path
+    output_dir: Path

View file

@@ -5,11 +5,14 @@ from .postprocessors import check_for_insanity, create_df_tasks
 from .generators import GENERATORS
 from .run import Run
 from .constants import GRAY
+import platformdirs
 import seaborn as sns
 import matplotlib as mpl
 from pathlib import Path
 from typing import Optional
 
+CACHE_DIR = platformdirs.user_cache_dir("econtai")
+
 def run(output_dir: Optional[str] = None):
     if output_dir is None:
         output_dir = Path(".")
@@ -17,12 +20,12 @@ def run(output_dir: Optional[str] = None):
     load_dotenv()
     _setup_graph_rendering()
 
-    current_run = Run()
+    current_run = Run(output_dir=output_dir, cache_dir=CACHE_DIR)
 
     # Fetchers (fetchers.py)
-    current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run.meta)
-    current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run.meta)
-    current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run.meta)
+    current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run)
+    current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run)
+    current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run)
 
     # Enrichments (enrichments.py)
     current_run.task_estimateability_df = enrich_with_task_estimateability(current_run)
@@ -34,7 +37,7 @@ def run(output_dir: Optional[str] = None):
 
     # Generators (generators/)
     for gen in GENERATORS:
-        gen(current_run, output_dir)
+        gen(current_run)
 
 
 def _setup_graph_rendering():
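Taken together, run() now threads a single Run, carrying cache_dir and output_dir, through fetchers, enrichments, and generators. A hedged invocation sketch, not part of the commit (the module path for run() is an assumption):

# Hypothetical entry point: run the pipeline into ./out, with downloads
# cached under the platformdirs user cache configured above.
from pathlib import Path

from pipeline import run  # assumed import path for the module defining run()

if __name__ == "__main__":
    run(output_dir=Path("out"))  # pydantic coerces this into Run.output_dir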