Félix Dorn 2025-07-15 00:34:54 +02:00
parent 62296e1b69
commit 65dc648797
37 changed files with 1413 additions and 2433 deletions


"""
Fetchers retrieve remote data and return it in a format suitable for further processing.
Each fetcher caches its result on disk in the given cache directory so repeated runs do not re-download.
"""
import sqlite3
import pandas as pd
import requests
import io
import zipfile
import yaml
from pathlib import Path
from .logger import logger
from typing import Tuple, Dict
ONET_VERSION = "29_1"
ONET_URL = f"https://www.onetcenter.org/dl_files/database/db_{ONET_VERSION}_mysql.zip"


def fetch_onet_database(cache_dir: Path) -> sqlite3.Connection:
    """
    Downloads the O*NET database, creates a local SQLite file from it, and returns a connection.
    """
    DB_PATH = cache_dir / f"onet_{ONET_VERSION}.db"

    if DB_PATH.exists():
        logger.info(f"Using cached O*NET database: {DB_PATH}")
        return sqlite3.connect(DB_PATH)

    logger.info(f"Downloading O*NET database from {ONET_URL}")
    response = requests.get(ONET_URL, stream=True, headers={
        "User-Agent": "econ-agent/1.0"
    })
    response.raise_for_status()

    # Set performance PRAGMAs for fast import
    logger.info("Creating new SQLite database with performance settings")
    conn = sqlite3.connect(DB_PATH)
    conn.executescript("""
    PRAGMA journal_mode = OFF;
    PRAGMA synchronous = 0;
    -- ...
    PRAGMA foreign_keys = ON;
    """)

    zip_content = response.content
    with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
        sql_scripts = []
        for filename in sorted(z.namelist()):
            # Loop body elided in the diff view; a typical reconstruction collects each .sql member.
            if filename.lower().endswith(".sql"):
                with z.open(filename) as sql_file:
                    sql_scripts.append(sql_file.read().decode("utf-8"))

    if not sql_scripts:
        raise RuntimeError("No SQL files found in the O*NET zip archive.")

    # Combine and execute all SQL files in one transaction
    logger.info("Executing SQL files in alphabetical order (single transaction mode)")
    full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;"
    conn.executescript(full_script)

    logger.info("Database populated successfully. Restoring reliability settings...")

    # Restore reliability-focused settings after import
    conn.executescript("""
    PRAGMA journal_mode = WAL;
    PRAGMA synchronous = NORMAL;
    -- ...
    """)
    conn.execute("VACUUM;")
    conn.commit()
    logger.info("Reliability settings restored and database optimized successfully!")

    return conn
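For reference, a minimal sketch of how a caller might inspect the returned connection; the cache directory below is hypothetical, and the query only touches SQLite's built-in sqlite_master catalog.

# Illustrative only; "cache" is a hypothetical directory, not a path the pipeline defines.
conn = fetch_onet_database(Path("cache"))
tables = [row[0] for row in conn.execute(
    "SELECT name FROM sqlite_master WHERE type = 'table' ORDER BY name"
)]
print(f"O*NET tables imported: {len(tables)}")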
def fetch_oesm_data(cache_dir: Path) -> pd.DataFrame:
    """
    Downloads the OESM national data from the BLS website.
    """
    VERSION = "23"
    URL = f"https://www.bls.gov/oes/special-requests/oesm{VERSION}nat.zip"
    DATA_PATH = cache_dir / "oesm.parquet"

    if DATA_PATH.exists():
        logger.info(f"Using cached OESM data: {DATA_PATH}")
        return pd.read_parquet(DATA_PATH)

    logger.info(f"Downloading OESM data from {URL}")
    headers = {'User-Agent': 'econ-agent/1.0'}
    response = requests.get(URL, headers=headers)
    response.raise_for_status()
    zip_content = response.content

    logger.info(f"Creating new OESM data cache: {DATA_PATH}")
    with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
        with z.open(f"oesm{VERSION}national.xlsx") as f:
            df = pd.read_excel(f, engine='openpyxl', na_values=['*', '#'])

    df.to_parquet(DATA_PATH)
    logger.info(f"Saved OESM data to cache: {DATA_PATH}")
    return df
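The hard-coded member name assumes the BLS archive layout stays fixed. If that assumption ever breaks, a defensive lookup along the lines of the previous search-based code is a reasonable fallback; a sketch:

# Sketch of a defensive member lookup (mirrors the earlier search-based approach).
def find_excel_member(z: zipfile.ZipFile) -> str:
    for filename in z.namelist():
        if filename.lower().endswith(".xlsx"):
            return filename
    raise FileNotFoundError("Could not find the Excel file in the OESM zip archive.")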
def fetch_epoch_remote_data(cache_dir: Path) -> pd.DataFrame:
    """
    Downloads the EPOCH AI remote work task data.
    """
    # This is the direct download link constructed from the Google Drive share link
    URL = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
    DATA_PATH = cache_dir / "epoch_remote_latest.parquet"

    if DATA_PATH.exists():
        logger.info(f"Using cached EPOCH remote data: {DATA_PATH}")
        return pd.read_parquet(DATA_PATH)

    logger.info(f"Downloading EPOCH remote data from Google Drive: {URL}")
    # Need to handle potential cookies/redirects from Google Drive
    session = requests.Session()
    session.headers.update({"User-Agent": "econ-agent/1.0"})
    response = session.get(URL, stream=True)
    response.raise_for_status()
    csv_content = response.content

    logger.info(f"Creating new EPOCH remote data cache: {DATA_PATH}")
    df = pd.read_csv(io.BytesIO(csv_content))
    df.to_parquet(DATA_PATH)
    return df
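The cookies/redirects comment points at a known Google Drive quirk: for large files, the export URL can return an HTML confirmation page instead of the CSV. A hedged sketch of one common workaround, not part of this module and subject to Drive changing its behavior:

# Sketch only: re-request with a confirmation token if Drive returns its warning page.
def download_from_gdrive(session: requests.Session, url: str) -> bytes:
    response = session.get(url, stream=True)
    response.raise_for_status()
    token = next(
        (v for k, v in response.cookies.items() if k.startswith("download_warning")),
        None,
    )
    if token is not None:
        response = session.get(url, params={"confirm": token}, stream=True)
        response.raise_for_status()
    return response.content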
def fetch_metr_data(cache_dir: Path) -> Dict:
    """
    Downloads the METR benchmark results YAML and returns it as a dictionary.
    """
    URL = "https://metr.org/assets/benchmark_results.yaml"
    DATA_PATH = cache_dir / "metr_benchmark_results.yaml"

    if DATA_PATH.exists():
        logger.info(f"Using cached METR data: {DATA_PATH}")
        with open(DATA_PATH, "r") as f:
            return yaml.safe_load(f)

    logger.info(f"Downloading METR data from {URL}")
    headers = {"User-Agent": "econ-agent/1.0"}
    response = requests.get(URL, headers=headers)
    response.raise_for_status()
    yaml_content = response.content

    logger.info(f"Creating new METR data cache: {DATA_PATH}")
    with open(DATA_PATH, "wb") as f:
        f.write(yaml_content)
    return yaml.safe_load(yaml_content)
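Finally, a minimal usage sketch wiring the four fetchers together; the cache directory name is hypothetical and the mkdir call just makes the snippet self-contained.

# Illustrative wiring of the fetchers; not part of the pipeline itself.
if __name__ == "__main__":
    cache_dir = Path("cache")  # hypothetical location
    cache_dir.mkdir(parents=True, exist_ok=True)

    onet_conn = fetch_onet_database(cache_dir)
    oesm_df = fetch_oesm_data(cache_dir)
    epoch_df = fetch_epoch_remote_data(cache_dir)
    metr_results = fetch_metr_data(cache_dir)

    print(f"OESM rows: {len(oesm_df)}, EPOCH rows: {len(epoch_df)}")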