Félix Dorn 2025-07-15 00:34:54 +02:00
parent 62296e1b69
commit 65dc648797
37 changed files with 1413 additions and 2433 deletions


"""
Fetchers retrieve remote data and return it in a format suitable for further processing.
Each fetcher caches its result on disk in the given cache directory so repeated runs do not re-download.
"""
import sqlite3
import pandas as pd
import requests
import io
import zipfile
import yaml
from pathlib import Path
from .logger import logger
from typing import Tuple, Dict
ONET_VERSION = "29_1"
ONET_URL = f"https://www.onetcenter.org/dl_files/database/db_{ONET_VERSION}_mysql.zip"


def fetch_onet_database(cache_dir: Path) -> sqlite3.Connection:
    """
    Downloads the O*NET database, creates a local SQLite file from it, and returns a connection.
    """
    DB_PATH = cache_dir / f"onet_{ONET_VERSION}.db"

    if DB_PATH.exists():
        logger.info(f"Using cached O*NET database: {DB_PATH}")
        return sqlite3.connect(DB_PATH)

    logger.info(f"Downloading O*NET database from {ONET_URL}")
    response = requests.get(ONET_URL, stream=True, headers={
        "User-Agent": "econ-agent/1.0"
    })
    response.raise_for_status()

    # Set performance PRAGMAs for fast import
    logger.info("Creating new SQLite database with performance settings")
    conn = sqlite3.connect(DB_PATH)
    conn.executescript("""
    PRAGMA journal_mode = OFF;
    PRAGMA synchronous = 0;
    -- ...
    PRAGMA foreign_keys = ON;
    """)

    zip_content = response.content
    with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
        sql_scripts = []
        for filename in sorted(z.namelist()):
            # Loop body elided in the diff view; a typical reconstruction collects each .sql member.
            if filename.lower().endswith(".sql"):
                with z.open(filename) as sql_file:
                    sql_scripts.append(sql_file.read().decode("utf-8"))

    if not sql_scripts:
        raise RuntimeError("No SQL files found in the O*NET zip archive.")

    # Combine and execute all SQL files in one transaction
    logger.info("Executing SQL files in alphabetical order (single transaction mode)")
    full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;"
    conn.executescript(full_script)

    logger.info("Database populated successfully. Restoring reliability settings...")

    # Restore reliability-focused settings after import
    conn.executescript("""
    PRAGMA journal_mode = WAL;
    PRAGMA synchronous = NORMAL;
    -- ...
    """)
    conn.execute("VACUUM;")
    conn.commit()
    logger.info("Reliability settings restored and database optimized successfully!")

    return conn
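For reference, a minimal sketch of how a caller might inspect the returned connection; the cache directory below is hypothetical, and the query only touches SQLite's built-in sqlite_master catalog.

# Illustrative only; "cache" is a hypothetical directory, not a path the pipeline defines.
conn = fetch_onet_database(Path("cache"))
tables = [row[0] for row in conn.execute(
    "SELECT name FROM sqlite_master WHERE type = 'table' ORDER BY name"
)]
print(f"O*NET tables imported: {len(tables)}")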
def fetch_oesm_data(cache_dir: Path) -> pd.DataFrame:
    """
    Downloads the OESM national data from the BLS website.
    """
    VERSION = "23"
    URL = f"https://www.bls.gov/oes/special-requests/oesm{VERSION}nat.zip"
    DATA_PATH = cache_dir / "oesm.parquet"

    if DATA_PATH.exists():
        logger.info(f"Using cached OESM data: {DATA_PATH}")
        return pd.read_parquet(DATA_PATH)

    logger.info(f"Downloading OESM data from {URL}")
    headers = {'User-Agent': 'econ-agent/1.0'}
    response = requests.get(URL, headers=headers)
    response.raise_for_status()
    zip_content = response.content

    logger.info(f"Creating new OESM data cache: {DATA_PATH}")
    with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
        with z.open(f"oesm{VERSION}national.xlsx") as f:
            df = pd.read_excel(f, engine='openpyxl', na_values=['*', '#'])

    df.to_parquet(DATA_PATH)
    logger.info(f"Saved OESM data to cache: {DATA_PATH}")
    return df
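The hard-coded member name assumes the BLS archive layout stays fixed. If that assumption ever breaks, a defensive lookup along the lines of the previous search-based code is a reasonable fallback; a sketch:

# Sketch of a defensive member lookup (mirrors the earlier search-based approach).
def find_excel_member(z: zipfile.ZipFile) -> str:
    for filename in z.namelist():
        if filename.lower().endswith(".xlsx"):
            return filename
    raise FileNotFoundError("Could not find the Excel file in the OESM zip archive.")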
def fetch_epoch_remote_data(cache_dir: Path) -> pd.DataFrame:
    """
    Downloads the EPOCH AI remote work task data.
    """
    # This is the direct download link constructed from the Google Drive share link
    URL = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
    DATA_PATH = cache_dir / "epoch_remote_latest.parquet"

    if DATA_PATH.exists():
        logger.info(f"Using cached EPOCH remote data: {DATA_PATH}")
        return pd.read_parquet(DATA_PATH)

    logger.info(f"Downloading EPOCH remote data from Google Drive: {URL}")
    # Need to handle potential cookies/redirects from Google Drive
    session = requests.Session()
    session.headers.update({"User-Agent": "econ-agent/1.0"})
    response = session.get(URL, stream=True)
    response.raise_for_status()
    csv_content = response.content

    logger.info(f"Creating new EPOCH remote data cache: {DATA_PATH}")
    df = pd.read_csv(io.BytesIO(csv_content))
    df.to_parquet(DATA_PATH)
    return df
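The cookies/redirects comment points at a known Google Drive quirk: for large files, the export URL can return an HTML confirmation page instead of the CSV. A hedged sketch of one common workaround, not part of this module and subject to Drive changing its behavior:

# Sketch only: re-request with a confirmation token if Drive returns its warning page.
def download_from_gdrive(session: requests.Session, url: str) -> bytes:
    response = session.get(url, stream=True)
    response.raise_for_status()
    token = next(
        (v for k, v in response.cookies.items() if k.startswith("download_warning")),
        None,
    )
    if token is not None:
        response = session.get(url, params={"confirm": token}, stream=True)
        response.raise_for_status()
    return response.content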
def fetch_metr_data(cache_dir: Path) -> Dict:
    """
    Downloads the METR benchmark results YAML and returns it as a dictionary.
    """
    URL = "https://metr.org/assets/benchmark_results.yaml"
    DATA_PATH = cache_dir / "metr_benchmark_results.yaml"

    if DATA_PATH.exists():
        logger.info(f"Using cached METR data: {DATA_PATH}")
        with open(DATA_PATH, "r") as f:
            return yaml.safe_load(f)

    logger.info(f"Downloading METR data from {URL}")
    headers = {"User-Agent": "econ-agent/1.0"}
    response = requests.get(URL, headers=headers)
    response.raise_for_status()
    yaml_content = response.content

    logger.info(f"Creating new METR data cache: {DATA_PATH}")
    with open(DATA_PATH, "wb") as f:
        f.write(yaml_content)
    return yaml.safe_load(yaml_content)
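Finally, a minimal usage sketch wiring the four fetchers together; the cache directory name is hypothetical and the mkdir call just makes the snippet self-contained.

# Illustrative wiring of the fetchers; not part of the pipeline itself.
if __name__ == "__main__":
    cache_dir = Path("cache")  # hypothetical location
    cache_dir.mkdir(parents=True, exist_ok=True)

    onet_conn = fetch_onet_database(cache_dir)
    oesm_df = fetch_oesm_data(cache_dir)
    epoch_df = fetch_epoch_remote_data(cache_dir)
    metr_results = fetch_metr_data(cache_dir)

    print(f"OESM rows: {len(oesm_df)}, EPOCH rows: {len(epoch_df)}")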