wip
parent 62296e1b69
commit 65dc648797
37 changed files with 1413 additions and 2433 deletions
@@ -1,50 +1,30 @@
 """
 Fetchers retrieve remote data and return it in a format suitable for further processing, they also return its version, which should be considered opaque, though it is usually a checksum.
 """
 
 import sqlite3
-from typing import Tuple
 import pandas as pd
 import requests
 import io
 import zipfile
-from pipeline.run import Run
-from pipeline.logger import logger
+import yaml
+from pathlib import Path
+from .logger import logger
+from typing import Tuple, Dict
 
 
-def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
-    """
-    Downloads the O*NET database, creates a local SQLite file from it, and returns a connection.
-    """
-    version = "29_1"
-    url = f"https://www.onetcenter.org/dl_files/database/db_{version}_mysql.zip"
-    db_path = run.cache_dir / f"onet_{version}.db"
-    run.meta.fetchers['onet'] = {
-        'url': url,
-        'version': version,
-        'db_path': str(db_path),
-    }
+ONET_VERSION = "29_1"
+ONET_URL = f"https://www.onetcenter.org/dl_files/database/db_{ONET_VERSION}_mysql.zip"
 
-    if db_path.exists():
-        logger.info(f"Using cached O*NET database: {db_path}")
-        conn = sqlite3.connect(db_path)
-        return conn, version
+def fetch_onet_database(cache_dir: Path) -> sqlite3.Connection:
+    DB_PATH = cache_dir / f"onet_{ONET_VERSION}.db"
 
-    logger.info(f"Downloading O*NET database from {url}")
-    response = requests.get(url, stream=True, headers={
+    if DB_PATH.exists():
+        logger.info(f"Using cached O*NET database: {DB_PATH}")
+        return sqlite3.connect(DB_PATH)
+
+    logger.info(f"Downloading O*NET database from {ONET_URL}")
+    response = requests.get(ONET_URL, stream=True, headers={
         "User-Agent": "econ-agent/1.0"
     })
     response.raise_for_status()
 
-    # Read content into memory
-    zip_content = response.content
-
-    db_path = run.cache_dir / f"onet_{version}.db"
-
-    logger.info(f"Creating new O*NET database: {db_path}")
-    conn = sqlite3.connect(db_path)
-
-    # Set performance PRAGMAs for fast import
+    logger.info("Creating new SQLite database with performance settings")
+    conn = sqlite3.connect(DB_PATH)
     conn.executescript("""
         PRAGMA journal_mode = OFF;
         PRAGMA synchronous = 0;
@@ -54,6 +34,7 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
         PRAGMA foreign_keys = ON;
     """)
 
+    zip_content = response.content
     with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
         sql_scripts = []
         for filename in sorted(z.namelist()):
@@ -63,14 +44,10 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
     if not sql_scripts:
         raise RuntimeError("No SQL files found in the O*NET zip archive.")
 
-    # Combine and execute all SQL files in one transaction
-    full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;"
-
     logger.info("Executing SQL files in alphabetical order (single transaction mode)")
+    full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;"
     conn.executescript(full_script)
     logger.info("Database populated successfully. Restoring reliability settings...")
 
-    # Restore reliability-focused settings after import
     conn.executescript("""
         PRAGMA journal_mode = WAL;
         PRAGMA synchronous = NORMAL;
@@ -81,87 +58,75 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
     """)
     conn.execute("VACUUM;")
     conn.commit()
    logger.info("Reliability settings restored and database optimized successfully!")
 
-    return conn, version
+    return conn
 
 
-def fetch_oesm_data(run: Run) -> Tuple[pd.DataFrame, str]:
-    """
-    Downloads the OESM national data from the BLS website.
-    """
-    version = "23"
-    url = f"https://www.bls.gov/oes/special-requests/oesm{version}nat.zip"
-    parquet_path = run.cache_dir / "oesm.parquet"
-    run.meta.fetchers['oesm'] = {
-        'url': url,
-        'version': version,
-        'parquet_path': str(parquet_path),
-    }
+def fetch_oesm_data(cache_dir: Path) -> pd.DataFrame:
+    VERSION = "23"
+    URL = f"https://www.bls.gov/oes/special-requests/oesm{VERSION}nat.zip"
+    DATA_PATH = cache_dir / "oesm.parquet"
 
-    if parquet_path.exists():
-        logger.info(f"Using cached OESM data: {parquet_path}")
-        return pd.read_parquet(parquet_path), version
+    if DATA_PATH.exists():
+        logger.info(f"Using cached OESM data: {DATA_PATH}")
+        return pd.read_parquet(DATA_PATH)
 
-    logger.info(f"Downloading OESM data from {url}")
+    logger.info(f"Downloading OESM data from {URL}")
     headers = {'User-Agent': 'econ-agent/1.0'}
-    response = requests.get(url, headers=headers)
+    response = requests.get(URL, headers=headers)
     response.raise_for_status()
 
     zip_content = response.content
-    logger.info(f"OESM data version: {version}")
-
-    logger.info(f"Creating new OESM data cache: {parquet_path}")
+    logger.info(f"Creating new OESM data cache: {DATA_PATH}")
     with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
-        # Find the excel file in the zip
-        excel_filename = None
-        for filename in z.namelist():
-            logger.debug(f"Found file in OESM zip: {filename}")
-            if filename.lower().endswith(".xlsx"):
-                excel_filename = filename
-                break
-
-        if excel_filename is None:
-            raise FileNotFoundError("Could not find the Excel file in the OESM zip archive.")
-
-        logger.info(f"Reading {excel_filename} from zip archive.")
-        with z.open(excel_filename) as f:
+        with z.open(f"oesm{VERSION}national.xlsx") as f:
             df = pd.read_excel(f, engine='openpyxl', na_values=['*', '#'])
 
-    df.to_parquet(parquet_path)
-    logger.info(f"Saved OESM data to cache: {parquet_path}")
-    return df, version
+    df.to_parquet(DATA_PATH)
+    logger.info(f"Saved OESM data to cache: {DATA_PATH}")
+    return df
 
 
-def fetch_epoch_remote_data(run: Run) -> Tuple[pd.DataFrame, str]:
-    """
-    Downloads the EPOCH AI remote work task data.
-    """
-    # This is the direct download link constructed from the Google Drive share link
-    version = "latest"
-    url = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
-    parquet_path = run.cache_dir / f"epoch_remote_{version}.parquet"
-    run.meta.fetchers['epoch_remote'] = {
-        'url': url,
-        'version': version,
-        'parquet_path': str(parquet_path),
-    }
+def fetch_epoch_remote_data(cache_dir: Path) -> pd.DataFrame:
+    URL = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
+    DATA_PATH = cache_dir / f"epoch_remote_latest.parquet"
 
-    if parquet_path.exists():
-        logger.info(f"Using cached EPOCH remote data: {parquet_path}")
-        return pd.read_parquet(parquet_path), version
+    if DATA_PATH.exists():
+        logger.info(f"Using cached EPOCH remote data: {DATA_PATH}")
+        return pd.read_parquet(DATA_PATH)
 
-    logger.info(f"Downloading EPOCH remote data from Google Drive: {url}")
+    logger.info(f"Downloading EPOCH remote data from Google Drive: {URL}")
 
     # Need to handle potential cookies/redirects from Google Drive
     session = requests.Session()
     session.headers.update({"User-Agent": "econ-agent/1.0"})
-    response = session.get(url, stream=True)
+    response = session.get(URL, stream=True)
     response.raise_for_status()
 
     csv_content = response.content
 
-    logger.info(f"Creating new EPOCH remote data cache: {parquet_path}")
+    logger.info(f"Creating new EPOCH remote data cache: {DATA_PATH}")
     df = pd.read_csv(io.BytesIO(csv_content))
-    df.to_parquet(parquet_path)
-    logger.info(f"Saved EPOCH remote data to cache: {parquet_path}")
+    df.to_parquet(DATA_PATH)
 
-    return df, version
+    return df
+
+
+def fetch_metr_data(cache_dir: Path) -> Dict:
+    URL = "https://metr.org/assets/benchmark_results.yaml"
+    DATA_PATH = cache_dir / "metr_benchmark_results.yaml"
+
+    if DATA_PATH.exists():
+        logger.info(f"Using cached METR data: {DATA_PATH}")
+        with open(DATA_PATH, "r") as f:
+            return yaml.safe_load(f)
+
+    logger.info(f"Downloading METR data from {URL}")
+    headers = {"User-Agent": "econ-agent/1.0"}
+    response = requests.get(URL, headers=headers)
+    response.raise_for_status()
+
+    yaml_content = response.content
+
+    logger.info(f"Creating new METR data cache: {DATA_PATH}")
+    with open(DATA_PATH, "wb") as f:
+        f.write(yaml_content)
+
+    return yaml.safe_load(yaml_content)
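
For reference, a minimal sketch of how a caller might use the fetchers after this refactor, now that they take a cache directory instead of a Run object and no longer return a version string. The import path pipeline.fetchers, the "cache" directory name, and the sqlite_master query are illustrative assumptions, not part of this commit.

# Illustrative only: hypothetical caller for the refactored fetchers.
from pathlib import Path

from pipeline.fetchers import (  # assumed module path, not shown in the diff
    fetch_onet_database,
    fetch_oesm_data,
    fetch_epoch_remote_data,
    fetch_metr_data,
)

cache_dir = Path("cache")              # assumed location
cache_dir.mkdir(parents=True, exist_ok=True)

conn = fetch_onet_database(cache_dir)          # sqlite3.Connection to the local O*NET copy
oesm_df = fetch_oesm_data(cache_dir)           # DataFrame of BLS OESM national data
epoch_df = fetch_epoch_remote_data(cache_dir)  # DataFrame of EPOCH AI remote-task data
metr = fetch_metr_data(cache_dir)              # dict parsed from METR's benchmark YAML

# List the tables created from the O*NET SQL dump; actual table names depend
# on the O*NET release and are not shown in this diff.
tables = conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
conn.close()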