Félix Dorn 2025-07-03 19:40:35 +02:00
parent b7c94590f9
commit f9f9825abb
9 changed files with 941 additions and 42 deletions


@@ -6,36 +6,40 @@ import sqlite3
 from typing import Tuple
 import pandas as pd
 import requests
-import hashlib
 import io
 import zipfile
-from .run import Run
-from .logger import logger
+from pipeline.run import Run
+from pipeline.logger import logger


 def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
     """
     Downloads the O*NET database, creates a local SQLite file from it, and returns a connection.
-    The version is the sha256 of the downloaded zip file.
     """
-    url = "https://www.onetcenter.org/dl_files/database/db_29_1_mysql.zip"
-    logger.info(f"Downloading O*NET database from {url}")
-    response = requests.get(url, stream=True)
-    response.raise_for_status()
-
-    # Read content into memory
-    zip_content = response.content
-    version = hashlib.sha256(zip_content).hexdigest()
-    logger.info(f"O*NET database version (sha256): {version}")
-
+    version = "29_1"
+    url = f"https://www.onetcenter.org/dl_files/database/db_{version}_mysql.zip"
     db_path = run.cache_dir / f"onet_{version}.db"
+    run.meta.fetchers['onet'] = {
+        'url': url,
+        'version': version,
+        'db_path': str(db_path),
+    }
+
     if db_path.exists():
         logger.info(f"Using cached O*NET database: {db_path}")
         conn = sqlite3.connect(db_path)
         # Set PRAGMA for foreign keys on every connection
         conn.execute("PRAGMA foreign_keys = ON;")
         return conn, version

-    db_path = run.cache_dir / f"onet_{version}.db"
+    logger.info(f"Downloading O*NET database from {url}")
+    response = requests.get(url, stream=True, headers={
+        "User-Agent": "econ-agent/1.0"
+    })
+    response.raise_for_status()
+
+    # Read content into memory
+    zip_content = response.content
+
     logger.info(f"Creating new O*NET database: {db_path}")
     conn = sqlite3.connect(db_path)
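The refactor above leans on two attributes of Run that this commit does not show: a cache_dir path and a meta.fetchers registry recording each dataset's url, version, and cache path. A minimal sketch of that assumed contract (names inferred from the diff, not confirmed by this commit):

    # Hypothetical sketch of the pipeline.run contract the fetchers rely on;
    # the real Run/Meta classes live elsewhere in the repository.
    from dataclasses import dataclass, field
    from pathlib import Path

    @dataclass
    class Meta:
        # maps fetcher name -> {'url': ..., 'version': ..., '<kind>_path': ...}
        fetchers: dict = field(default_factory=dict)

    @dataclass
    class Run:
        cache_dir: Path
        meta: Meta = field(default_factory=Meta)

Recording provenance in run.meta instead of hashing each download is what lets every fetcher check its cache before touching the network.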
@@ -84,22 +88,28 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
 def fetch_oesm_data(run: Run) -> Tuple[pd.DataFrame, str]:
     """
     Downloads the OESM national data from the BLS website.
-    The version is the sha256 of the downloaded zip file.
     """
-    url = "https://www.bls.gov/oes/special-requests/oesm23nat.zip"
-    logger.info(f"Downloading OESM data from {url}")
-    response = requests.get(url)
-    response.raise_for_status()
-
-    zip_content = response.content
-    version = hashlib.sha256(zip_content).hexdigest()
-    logger.info(f"OESM data version (sha256): {version}")
-
-    parquet_path = run.cache_dir / f"oesm_{version}.parquet"
+    version = "23"
+    url = f"https://www.bls.gov/oes/special-requests/oesm{version}nat.zip"
+    parquet_path = run.cache_dir / "oesm.parquet"
+    run.meta.fetchers['oesm'] = {
+        'url': url,
+        'version': version,
+        'parquet_path': str(parquet_path),
+    }
     if parquet_path.exists():
         logger.info(f"Using cached OESM data: {parquet_path}")
         return pd.read_parquet(parquet_path), version

+    logger.info(f"Downloading OESM data from {url}")
+    headers = {'User-Agent': 'econ-agent/1.0'}
+    response = requests.get(url, headers=headers)
+    response.raise_for_status()
+    zip_content = response.content
+    logger.info(f"OESM data version: {version}")
+
     logger.info(f"Creating new OESM data cache: {parquet_path}")
     with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
         # Find the excel file in the zip
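The new User-Agent header is load-bearing for this fetch: bls.gov is widely reported to answer the default python-requests agent with 403 Forbidden. A quick probe to check this on your own connection (URL taken from the diff; the 403 behavior is an assumption to verify, not a guarantee):

    # Probe: fetch the OESM zip with and without a custom User-Agent.
    import requests

    url = "https://www.bls.gov/oes/special-requests/oesm23nat.zip"
    for headers in (None, {"User-Agent": "econ-agent/1.0"}):
        r = requests.get(url, headers=headers)
        print(headers, r.status_code)  # default agent often gets 403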
@@ -115,7 +125,7 @@ def fetch_oesm_data(run: Run) -> Tuple[pd.DataFrame, str]:
     logger.info(f"Reading {excel_filename} from zip archive.")
     with z.open(excel_filename) as f:
-        df = pd.read_excel(f, engine='openpyxl')
+        df = pd.read_excel(f, engine='openpyxl', na_values=['*', '#'])

     df.to_parquet(parquet_path)
     logger.info(f"Saved OESM data to cache: {parquet_path}")
@@ -124,25 +134,30 @@ def fetch_oesm_data(run: Run) -> Tuple[pd.DataFrame, str]:
 def fetch_epoch_remote_data(run: Run) -> Tuple[pd.DataFrame, str]:
     """
     Downloads the EPOCH AI remote work task data.
-    The version is the sha256 of the downloaded CSV file.
     """
     # This is the direct download link constructed from the Google Drive share link
+    version = "latest"
     url = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
+    parquet_path = run.cache_dir / f"epoch_remote_{version}.parquet"
+    run.meta.fetchers['epoch_remote'] = {
+        'url': url,
+        'version': version,
+        'parquet_path': str(parquet_path),
+    }
+    if parquet_path.exists():
+        logger.info(f"Using cached EPOCH remote data: {parquet_path}")
+        return pd.read_parquet(parquet_path), version
+
     logger.info(f"Downloading EPOCH remote data from Google Drive: {url}")
     # Need to handle potential cookies/redirects from Google Drive
     session = requests.Session()
+    session.headers.update({"User-Agent": "econ-agent/1.0"})
     response = session.get(url, stream=True)
     response.raise_for_status()

     csv_content = response.content
-    version = hashlib.sha256(csv_content).hexdigest()
-    logger.info(f"EPOCH remote data version (sha256): {version}")
-
-    parquet_path = run.cache_dir / f"epoch_remote_{version}.parquet"
-    if parquet_path.exists():
-        logger.info(f"Using cached EPOCH remote data: {parquet_path}")
-        return pd.read_parquet(parquet_path), version
-
     logger.info(f"Creating new EPOCH remote data cache: {parquet_path}")
     df = pd.read_csv(io.BytesIO(csv_content))
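One caveat with the uc?export=download endpoint: for files above Google Drive's virus-scan size limit it returns an HTML confirmation page rather than the raw CSV, which read_csv would then fail to parse. A sketch of a fail-fast guard (an assumption about Drive's behavior, not part of this commit):

    # Sketch: detect the HTML confirmation page Google Drive serves for large files.
    import io
    import pandas as pd
    import requests

    url = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
    session = requests.Session()
    session.headers.update({"User-Agent": "econ-agent/1.0"})
    response = session.get(url, stream=True)
    response.raise_for_status()
    if "text/html" in response.headers.get("Content-Type", ""):
        raise RuntimeError("Google Drive sent a confirmation page, not the CSV")
    df = pd.read_csv(io.BytesIO(response.content))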