progress

commit b7c94590f9 (parent 2da206d368)
14 changed files with 2200 additions and 13 deletions
@@ -7,6 +7,7 @@ from .run import Run
 import pandas as pd


 def enrich_with_task_estimateability(run: Run) -> pd.DataFrame:
+    run.metadata.
     raise NotImplementedError


 def enrich_with_task_estimates(run: Run) -> pd.DataFrame:
@@ -5,13 +5,148 @@ Fetchers retrieve remote data and return it in a format suitable for further processing.
 import sqlite3
 from typing import Tuple
 import pandas as pd
 from .metadata import Metadata
+import requests
+import hashlib
+import io
+import zipfile
+from .run import Run
+from .logger import logger


-def fetch_onet_database(meta: Metadata) -> Tuple[sqlite3.Connection, str]:
-    raise NotImplementedError
+def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
+    """
+    Downloads the O*NET database, creates a local SQLite file from it, and returns a connection.
+    The version is the sha256 of the downloaded zip file.
+    """
+    url = "https://www.onetcenter.org/dl_files/database/db_29_1_mysql.zip"
+    logger.info(f"Downloading O*NET database from {url}")
+    response = requests.get(url, stream=True)
+    response.raise_for_status()

-def fetch_oesm_data(meta: Metadata) -> Tuple[pd.DataFrame, str]:
-    raise NotImplementedError
+    # Read content into memory
+    zip_content = response.content
+    version = hashlib.sha256(zip_content).hexdigest()
+    logger.info(f"O*NET database version (sha256): {version}")

-def fetch_epoch_remote_data(meta: Metadata) -> Tuple[pd.DataFrame, str]:
-    raise NotImplementedError
+    db_path = run.cache_dir / f"onet_{version}.db"
+
+    if db_path.exists():
+        logger.info(f"Using cached O*NET database: {db_path}")
+        conn = sqlite3.connect(db_path)
+        # Set PRAGMA for foreign keys on every connection
+        conn.execute("PRAGMA foreign_keys = ON;")
+        return conn, version
+
+    logger.info(f"Creating new O*NET database: {db_path}")
+    conn = sqlite3.connect(db_path)
+
+    # Set performance PRAGMAs for fast import
+    logger.info("Creating new SQLite database with performance settings")
+    conn.executescript("""
+        PRAGMA journal_mode = OFF;
+        PRAGMA synchronous = 0;
+        PRAGMA cache_size = 1000000;
+        PRAGMA locking_mode = EXCLUSIVE;
+        PRAGMA temp_store = MEMORY;
+        PRAGMA foreign_keys = ON;
+    """)
+
+    with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
+        sql_scripts = []
+        for filename in sorted(z.namelist()):
+            if filename.endswith(".sql"):
+                sql_scripts.append(z.read(filename).decode('utf-8'))
+
+    if not sql_scripts:
+        raise RuntimeError("No SQL files found in the O*NET zip archive.")
+
+    # Combine and execute all SQL files in one transaction
+    full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;"
+
+    logger.info("Executing SQL files in alphabetical order (single transaction mode)")
+    conn.executescript(full_script)
+    logger.info("Database populated successfully. Restoring reliability settings...")
+
+    # Restore reliability-focused settings after import
+    conn.executescript("""
+        PRAGMA journal_mode = WAL;
+        PRAGMA synchronous = NORMAL;
+        PRAGMA locking_mode = NORMAL;
+        PRAGMA temp_store = DEFAULT;
+        PRAGMA foreign_keys = ON;
+        PRAGMA optimize;
+    """)
+    conn.execute("VACUUM;")
+    conn.commit()
+    logger.info("Reliability settings restored and database optimized successfully!")
+
+    return conn, version
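A quick usage sketch, not part of the diff: the returned connection can be queried directly with pandas. The table name "occupation_data" below is an assumption about the O*NET schema, not something this commit shows.

    # Hypothetical usage of fetch_onet_database (table name is assumed)
    conn, onet_version = fetch_onet_database(current_run)
    occupations = pd.read_sql_query("SELECT * FROM occupation_data LIMIT 5;", conn)
    print(onet_version, len(occupations))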

+def fetch_oesm_data(run: Run) -> Tuple[pd.DataFrame, str]:
+    """
+    Downloads the OESM national data from the BLS website.
+    The version is the sha256 of the downloaded zip file.
+    """
+    url = "https://www.bls.gov/oes/special-requests/oesm23nat.zip"
+    logger.info(f"Downloading OESM data from {url}")
+    response = requests.get(url)
+    response.raise_for_status()
+
+    zip_content = response.content
+    version = hashlib.sha256(zip_content).hexdigest()
+    logger.info(f"OESM data version (sha256): {version}")
+
+    parquet_path = run.cache_dir / f"oesm_{version}.parquet"
+    if parquet_path.exists():
+        logger.info(f"Using cached OESM data: {parquet_path}")
+        return pd.read_parquet(parquet_path), version
+
+    logger.info(f"Creating new OESM data cache: {parquet_path}")
+    with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
+        # Find the excel file in the zip
+        excel_filename = None
+        for filename in z.namelist():
+            logger.debug(f"Found file in OESM zip: {filename}")
+            if filename.lower().endswith(".xlsx"):
+                excel_filename = filename
+                break
+
+        if excel_filename is None:
+            raise FileNotFoundError("Could not find the Excel file in the OESM zip archive.")
+
+        logger.info(f"Reading {excel_filename} from zip archive.")
+        with z.open(excel_filename) as f:
+            df = pd.read_excel(f, engine='openpyxl')
+
+    df.to_parquet(parquet_path)
+    logger.info(f"Saved OESM data to cache: {parquet_path}")
+    return df, version
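For context, a sketch of how the returned OESM frame might be inspected after fetching; the column names (OCC_CODE, OCC_TITLE, A_MEAN) are assumptions about the BLS spreadsheet layout rather than something this commit asserts.

    # Hypothetical inspection of the OESM data (column names assumed)
    oesm_df, oesm_version = fetch_oesm_data(current_run)
    wages = oesm_df[["OCC_CODE", "OCC_TITLE", "A_MEAN"]].dropna()
    print(oesm_version, wages.head())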

+def fetch_epoch_remote_data(run: Run) -> Tuple[pd.DataFrame, str]:
+    """
+    Downloads the EPOCH AI remote work task data.
+    The version is the sha256 of the downloaded CSV file.
+    """
+    # This is the direct download link constructed from the Google Drive share link
+    url = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
+    logger.info(f"Downloading EPOCH remote data from Google Drive: {url}")
+
+    # Need to handle potential cookies/redirects from Google Drive
+    session = requests.Session()
+    response = session.get(url, stream=True)
+    response.raise_for_status()
+
+    csv_content = response.content
+    version = hashlib.sha256(csv_content).hexdigest()
+    logger.info(f"EPOCH remote data version (sha256): {version}")
+
+    parquet_path = run.cache_dir / f"epoch_remote_{version}.parquet"
+    if parquet_path.exists():
+        logger.info(f"Using cached EPOCH remote data: {parquet_path}")
+        return pd.read_parquet(parquet_path), version
+
+    logger.info(f"Creating new EPOCH remote data cache: {parquet_path}")
+    df = pd.read_csv(io.BytesIO(csv_content))
+    df.to_parquet(parquet_path)
+    logger.info(f"Saved EPOCH remote data to cache: {parquet_path}")
+
+    return df, version
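The in-code comment notes that Google Drive cookies/redirects may need handling: large files are served behind a virus-scan warning page, which a single session.get does not get past. A minimal sketch of the commonly used confirm-token workaround follows; the helper name and cookie check are assumptions, not part of this commit, and the existing code may already be sufficient if the CSV is small enough to skip the warning page.

    # Sketch only: Google Drive "confirm" workaround for large files (assumed, not in the diff)
    def _download_from_google_drive(file_id: str) -> bytes:
        url = "https://drive.google.com/uc?export=download"
        session = requests.Session()
        response = session.get(url, params={"id": file_id}, stream=True)
        # Large files return an HTML warning page plus a confirmation token cookie.
        token = next((v for k, v in response.cookies.items() if k.startswith("download_warning")), None)
        if token is not None:
            response = session.get(url, params={"id": file_id, "confirm": token}, stream=True)
        response.raise_for_status()
        return response.content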
@@ -2,5 +2,5 @@ from ..run import Run
 from pathlib import Path
 from typing import Generator

-def generate_estimate_histplot(run: Run, output_dir: Path) -> Generator[Path]:
+def generate_estimate_histplot(run: Run) -> Generator[Path]:
     raise NotImplementedError
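The generator itself is still a stub; the signature change only reflects that output_dir now lives on Run. A minimal sketch of the implied contract (take a Run, yield paths of files written under run.output_dir) follows. The column name "estimate_minutes" and the output file name are assumptions, and the return type is spelled Generator[Path, None, None] for compatibility with older typing.

    # Minimal sketch; column and file names are assumptions, not from this commit.
    import seaborn as sns
    import matplotlib.pyplot as plt

    def generate_estimate_histplot(run: Run) -> Generator[Path, None, None]:
        fig, ax = plt.subplots()
        sns.histplot(run.task_estimates_df["estimate_minutes"], ax=ax)
        out_path = run.output_dir / "estimate_histplot.png"
        fig.savefig(out_path)
        plt.close(fig)
        yield out_path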
pipeline/logger.py (new file, 24 lines)

@@ -0,0 +1,24 @@
+import logging
+from logging.handlers import RotatingFileHandler
+from rich.logging import RichHandler
+
+LOGGER_NAME = "pipeline"
+
+def setup_logging() -> logging.Logger:
+    # Set up Rich console handler
+    rich_handler = RichHandler(
+        level=logging.DEBUG,
+        show_time=True,
+        enable_link_path=True,
+        rich_tracebacks=True,
+        # omit_repeated_times=False,
+    )
+
+    logger = logging.getLogger(LOGGER_NAME)
+    logger.setLevel(logging.DEBUG)
+    logger.addHandler(rich_handler)
+
+    return logger
+
+
+logger = setup_logging()
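RotatingFileHandler is imported but never attached, so only the Rich console handler is active. If file logging was intended, a sketch of what could be added inside setup_logging() looks like this; the log file name, size limit, and backup count are assumptions.

    # Hypothetical file handler; path, size, and backup count are assumptions.
    file_handler = RotatingFileHandler("pipeline.log", maxBytes=5_000_000, backupCount=3)
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s"))
    logger.addHandler(file_handler)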
@@ -16,6 +16,7 @@ class Metadata(BaseModel):
     versions, and other important information.
     """
     fetchers: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
     enrichments: Dict[str, Dict[str, Any]] = Field(default_factory=dict)

+    ts: str = Field(default_factory=lambda: datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
     commit: str = Field(default_factory=lambda: _get_current_commit())
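The _get_current_commit() helper is not shown in this hunk; presumably it shells out to git. A sketch under that assumption:

    # Assumed helper; the actual implementation is not shown in this diff.
    import subprocess

    def _get_current_commit() -> str:
        try:
            return subprocess.check_output(["git", "rev-parse", "HEAD"], text=True).strip()
        except (subprocess.CalledProcessError, FileNotFoundError):
            return "unknown"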
@@ -1,6 +1,7 @@
 from pydantic import BaseModel, Field
 import sqlite3
 import pandas as pd
+from pathlib import Path
 from typing import Optional
 from .metadata import Metadata


@@ -20,3 +21,6 @@ class Run(BaseModel):
     task_estimates_df: Optional[pd.DataFrame] = None

     meta: Metadata = Field(default_factory=Metadata)
+
+    cache_dir: Path
+    output_dir: Path
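Run holds sqlite3.Connection and pd.DataFrame fields, which pydantic only accepts when the model opts in to arbitrary types. If that is not already configured elsewhere in the file, something like the following (pydantic v2 syntax) is needed:

    from pydantic import BaseModel, ConfigDict

    class Run(BaseModel):
        # Allow non-pydantic field types such as sqlite3.Connection and pd.DataFrame
        model_config = ConfigDict(arbitrary_types_allowed=True)
        ...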
@@ -5,11 +5,14 @@ from .postprocessors import check_for_insanity, create_df_tasks
 from .generators import GENERATORS
 from .run import Run
 from .constants import GRAY
+import platformdirs
 import seaborn as sns
 import matplotlib as mpl
 from pathlib import Path
 from typing import Optional

+CACHE_DIR = platformdirs.user_cache_dir("econtai")
+

 def run(output_dir: Optional[str] = None):
     if output_dir is None:
         output_dir = Path(".")
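One caveat: platformdirs.user_cache_dir returns the path as a string but does not create the directory, and the fetchers write SQLite and Parquet files straight into run.cache_dir. Unless the directory is created elsewhere, a sketch of the fix looks like this (a suggestion, not part of the commit):

    from pathlib import Path
    import platformdirs

    CACHE_DIR = Path(platformdirs.user_cache_dir("econtai"))
    CACHE_DIR.mkdir(parents=True, exist_ok=True)  # ensure the cache directory exists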
@@ -17,12 +20,12 @@ def run(output_dir: Optional[str] = None):
     load_dotenv()
     _setup_graph_rendering()

-    current_run = Run()
+    current_run = Run(output_dir=output_dir, cache_dir=CACHE_DIR)

     # Fetchers (fetchers.py)
-    current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run.meta)
-    current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run.meta)
-    current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run.meta)
+    current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run)
+    current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run)
+    current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run)

     # Enrichments (enrichments.py)
     current_run.task_estimateability_df = enrich_with_task_estimateability(current_run)
@@ -34,7 +37,7 @@ def run(output_dir: Optional[str] = None):

     # Generators (generators/)
     for gen in GENERATORS:
-        gen(current_run, output_dir)
+        gen(current_run)


 def _setup_graph_rendering():