progress

parent 2da206d368
commit b7c94590f9

14 changed files with 2200 additions and 13 deletions
@@ -7,6 +7,7 @@ from .run import Run
 import pandas as pd
 
 def enrich_with_task_estimateability(run: Run) -> pd.DataFrame:
+    # TODO: use run.meta
     raise NotImplementedError
 
 def enrich_with_task_estimates(run: Run) -> pd.DataFrame:
@@ -5,13 +5,148 @@ Fetchers retrieve remote data and return it in a format suitable for further processing.
 import sqlite3
 from typing import Tuple
 import pandas as pd
-from .metadata import Metadata
+import requests
+import hashlib
+import io
+import zipfile
+from .run import Run
+from .logger import logger
 
-def fetch_onet_database(meta: Metadata) -> Tuple[sqlite3.Connection, str]:
-    raise NotImplementedError
+def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
+    """
+    Downloads the O*NET database, creates a local SQLite file from it, and returns a connection.
+    The version is the sha256 of the downloaded zip file.
+    """
+    url = "https://www.onetcenter.org/dl_files/database/db_29_1_mysql.zip"
+    logger.info(f"Downloading O*NET database from {url}")
+    response = requests.get(url, stream=True)
+    response.raise_for_status()
+
+    # Read content into memory
+    zip_content = response.content
+    version = hashlib.sha256(zip_content).hexdigest()
+    logger.info(f"O*NET database version (sha256): {version}")
+
+    db_path = run.cache_dir / f"onet_{version}.db"
+
+    if db_path.exists():
+        logger.info(f"Using cached O*NET database: {db_path}")
+        conn = sqlite3.connect(db_path)
+        # Set PRAGMA for foreign keys on every connection
+        conn.execute("PRAGMA foreign_keys = ON;")
+        return conn, version
+
+    logger.info(f"Creating new O*NET database: {db_path}")
+    conn = sqlite3.connect(db_path)
+
+    # Set performance PRAGMAs for fast import
+    logger.info("Creating new SQLite database with performance settings")
+    conn.executescript("""
+        PRAGMA journal_mode = OFF;
+        PRAGMA synchronous = 0;
+        PRAGMA cache_size = 1000000;
+        PRAGMA locking_mode = EXCLUSIVE;
+        PRAGMA temp_store = MEMORY;
+        PRAGMA foreign_keys = ON;
+    """)
+
+    with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
+        sql_scripts = []
+        for filename in sorted(z.namelist()):
+            if filename.endswith(".sql"):
+                sql_scripts.append(z.read(filename).decode('utf-8'))
+
+        if not sql_scripts:
+            raise RuntimeError("No SQL files found in the O*NET zip archive.")
+
+        # Combine and execute all SQL files in one transaction
+        full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;"
+
+        logger.info("Executing SQL files in alphabetical order (single transaction mode)")
+        conn.executescript(full_script)
+        logger.info("Database populated successfully. Restoring reliability settings...")
+
+    # Restore reliability-focused settings after import
+    conn.executescript("""
+        PRAGMA journal_mode = WAL;
+        PRAGMA synchronous = NORMAL;
+        PRAGMA locking_mode = NORMAL;
+        PRAGMA temp_store = DEFAULT;
+        PRAGMA foreign_keys = ON;
+        PRAGMA optimize;
+    """)
+    conn.execute("VACUUM;")
+    conn.commit()
+    logger.info("Reliability settings restored and database optimized successfully!")
+
+    return conn, version
 
-def fetch_oesm_data(meta: Metadata) -> Tuple[pd.DataFrame, str]:
-    raise NotImplementedError
+def fetch_oesm_data(run: Run) -> Tuple[pd.DataFrame, str]:
+    """
+    Downloads the OESM national data from the BLS website.
+    The version is the sha256 of the downloaded zip file.
+    """
+    url = "https://www.bls.gov/oes/special-requests/oesm23nat.zip"
+    logger.info(f"Downloading OESM data from {url}")
+    response = requests.get(url)
+    response.raise_for_status()
+
+    zip_content = response.content
+    version = hashlib.sha256(zip_content).hexdigest()
+    logger.info(f"OESM data version (sha256): {version}")
+
+    parquet_path = run.cache_dir / f"oesm_{version}.parquet"
+    if parquet_path.exists():
+        logger.info(f"Using cached OESM data: {parquet_path}")
+        return pd.read_parquet(parquet_path), version
+
+    logger.info(f"Creating new OESM data cache: {parquet_path}")
+    with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
+        # Find the excel file in the zip
+        excel_filename = None
+        for filename in z.namelist():
+            logger.debug(f"Found file in OESM zip: {filename}")
+            if filename.lower().endswith(".xlsx"):
+                excel_filename = filename
+                break
+
+        if excel_filename is None:
+            raise FileNotFoundError("Could not find the Excel file in the OESM zip archive.")
+
+        logger.info(f"Reading {excel_filename} from zip archive.")
+        with z.open(excel_filename) as f:
+            df = pd.read_excel(f, engine='openpyxl')
+
+    df.to_parquet(parquet_path)
+    logger.info(f"Saved OESM data to cache: {parquet_path}")
+    return df, version
 
-def fetch_epoch_remote_data(meta: Metadata) -> Tuple[pd.DataFrame, str]:
-    raise NotImplementedError
+def fetch_epoch_remote_data(run: Run) -> Tuple[pd.DataFrame, str]:
+    """
+    Downloads the EPOCH AI remote work task data.
+    The version is the sha256 of the downloaded CSV file.
+    """
+    # This is the direct download link constructed from the Google Drive share link
+    url = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
+    logger.info(f"Downloading EPOCH remote data from Google Drive: {url}")
+
+    # Need to handle potential cookies/redirects from Google Drive
+    session = requests.Session()
+    response = session.get(url, stream=True)
+    response.raise_for_status()
+
+    csv_content = response.content
+    version = hashlib.sha256(csv_content).hexdigest()
+    logger.info(f"EPOCH remote data version (sha256): {version}")
+
+    parquet_path = run.cache_dir / f"epoch_remote_{version}.parquet"
+    if parquet_path.exists():
+        logger.info(f"Using cached EPOCH remote data: {parquet_path}")
+        return pd.read_parquet(parquet_path), version
+
+    logger.info(f"Creating new EPOCH remote data cache: {parquet_path}")
+    df = pd.read_csv(io.BytesIO(csv_content))
+    df.to_parquet(parquet_path)
+    logger.info(f"Saved EPOCH remote data to cache: {parquet_path}")
+
+    return df, version
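Review note: all three fetchers share the same shape — download, hash the raw bytes with sha256 to get a version, then cache the parsed result under that hash in `run.cache_dir`. A minimal sketch of that shared pattern as a standalone helper; `cached_fetch` and the `build` callback are hypothetical names, not part of this commit:

```python
# Sketch only: the download -> sha256 version -> cache-by-hash pattern that
# the three fetchers inline. `cached_fetch` and `build` are hypothetical.
import hashlib
from pathlib import Path
from typing import Callable, Tuple

import requests


def cached_fetch(cache_dir: Path, name: str, url: str,
                 build: Callable[[bytes, Path], None]) -> Tuple[Path, str]:
    response = requests.get(url)
    response.raise_for_status()
    content = response.content

    # The content hash doubles as the dataset version, so a changed upstream
    # file automatically gets a fresh cache entry.
    version = hashlib.sha256(content).hexdigest()
    cache_path = cache_dir / f"{name}_{version}.parquet"

    if not cache_path.exists():
        build(content, cache_path)  # e.g. unzip, parse, write parquet
    return cache_path, version
```

The trade-off this pattern accepts: the download still happens on every run, since the hash cannot be known in advance; only the parse/import step is cached. For the O*NET import specifically, the risky PRAGMAs (`journal_mode = OFF`, `synchronous = 0`) are acceptable because a crash mid-import can only corrupt a cache file that would be rebuilt from the zip anyway.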
@@ -2,5 +2,5 @@ from ..run import Run
 from pathlib import Path
 from typing import Generator
 
-def generate_estimate_histplot(run: Run, output_dir: Path) -> Generator[Path]:
+def generate_estimate_histplot(run: Run) -> Generator[Path]:
    raise NotImplementedError
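With `output_dir` dropped from the generator signature, the output location presumably comes from the `Run` object instead. A hypothetical body under that assumption — only the signature comes from this commit, and the annotation is spelled in the three-parameter form for compatibility with Python versions before 3.13:

```python
# Hypothetical implementation inside the generators module; the body and
# filename are assumptions, only the signature exists in the commit.
from pathlib import Path
from typing import Generator

from ..run import Run


def generate_estimate_histplot(run: Run) -> Generator[Path, None, None]:
    out = run.output_dir / "estimate_histplot.png"
    # ... plot run.task_estimates_df with seaborn and save to `out` ...
    yield out
```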
pipeline/logger.py (new file, 24 lines)
@@ -0,0 +1,24 @@
+import logging
+from logging.handlers import RotatingFileHandler
+from rich.logging import RichHandler
+
+LOGGER_NAME = "pipeline"
+
+def setup_logging() -> logging.Logger:
+    # Set up Rich console handler
+    rich_handler = RichHandler(
+        level=logging.DEBUG,
+        show_time=True,
+        enable_link_path=True,
+        rich_tracebacks=True,
+        # omit_repeated_times=False,
+    )
+
+    logger = logging.getLogger(LOGGER_NAME)
+    logger.setLevel(logging.DEBUG)
+    logger.addHandler(rich_handler)
+
+    return logger
+
+
+logger = setup_logging()
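The module builds one shared `pipeline` logger at import time, so call sites import the instance rather than calling `logging.getLogger` themselves. Assumed usage, given the file lives at `pipeline/logger.py` (note that `RotatingFileHandler` is imported but not yet wired to a file handler):

```python
# Assumed usage; the import path follows the new file's location.
from pipeline.logger import logger

logger.debug("visible: handler and logger are both at DEBUG level")
logger.info("Rich renders the level, timestamp, and a link to the call site")
```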
@@ -16,6 +16,7 @@ class Metadata(BaseModel):
     versions, and other important information.
     """
     fetchers: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
+    enrichments: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
 
     ts: str = Field(default_factory=lambda: datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
     commit: str = Field(default_factory=lambda: _get_current_commit())
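The new `enrichments` field mirrors `fetchers`, but nothing writes to it yet in this commit. Presumably the enrichment steps will record provenance there, along these lines:

```python
# Hypothetical write; this commit only declares the field.
run.meta.enrichments["task_estimateability"] = {"version": "sha256-of-inputs"}
```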
@@ -1,6 +1,7 @@
 from pydantic import BaseModel, Field
 import sqlite3
 import pandas as pd
+from pathlib import Path
 from typing import Optional
 from .metadata import Metadata
 
@@ -20,3 +21,6 @@ class Run(BaseModel):
     task_estimates_df: Optional[pd.DataFrame] = None
 
     meta: Metadata = Field(default_factory=Metadata)
+
+    cache_dir: Path
+    output_dir: Path
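With `cache_dir` and `output_dir` now required fields, constructing a `Run` looks like the sketch below. Pydantic coerces strings to `Path`; the `mkdir` call is an assumption, since nothing shown here creates the cache directory before `sqlite3.connect` writes into it:

```python
# Construction sketch; assumes Run's model config already allows the
# arbitrary types (sqlite3.Connection, pd.DataFrame) it stores.
from pathlib import Path

from pipeline.run import Run  # import path assumed

run = Run(cache_dir=Path.home() / ".cache" / "econtai", output_dir=Path("."))
run.cache_dir.mkdir(parents=True, exist_ok=True)
```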
@@ -5,11 +5,14 @@ from .postprocessors import check_for_insanity, create_df_tasks
 from .generators import GENERATORS
 from .run import Run
 from .constants import GRAY
+import platformdirs
 import seaborn as sns
 import matplotlib as mpl
 from pathlib import Path
 from typing import Optional
 
+CACHE_DIR = platformdirs.user_cache_dir("econtai")
+
 def run(output_dir: Optional[str] = None):
     if output_dir is None:
         output_dir = Path(".")
@@ -17,12 +20,12 @@ def run(output_dir: Optional[str] = None):
     load_dotenv()
     _setup_graph_rendering()
 
-    current_run = Run()
+    current_run = Run(output_dir=output_dir, cache_dir=CACHE_DIR)
 
     # Fetchers (fetchers.py)
-    current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run.meta)
-    current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run.meta)
-    current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run.meta)
+    current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run)
+    current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run)
+    current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run)
 
     # Enrichments (enrichments.py)
     current_run.task_estimateability_df = enrich_with_task_estimateability(current_run)
@@ -34,7 +37,7 @@ def run(output_dir: Optional[str] = None):
 
     # Generators (generators/)
     for gen in GENERATORS:
-        gen(current_run, output_dir)
+        gen(current_run)
 
 
 def _setup_graph_rendering():
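End to end, the entry point now threads a single `Run` through fetch, enrich, postprocess, and generate, with the platformdirs cache keeping downloads out of the output tree. A hypothetical invocation; the module path is an assumption:

```python
# Hypothetical caller; the module path is an assumption.
from pipeline.main import run

run()            # output lands in the current directory
run("reports")   # or pass an explicit output directory (coerced to Path)
```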
Félix Dorn