sprint-econtai/analysis/data.py
Félix Dorn 43076bcbb1 old
2025-07-15 00:41:05 +02:00

207 lines
8 KiB
Python

import logging
import re
import requests
import shutil
import sqlite3
import zipfile
from pathlib import Path
# Configure logging to provide feedback during the data setup process
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- Constants ---
# Using a data directory at the root of the project
DATA_DIR = Path("data")

# O*NET database details. We download the MySQL version and convert it to SQLite.
ONET_MYSQL_URL = "https://www.onetcenter.org/dl_files/database/db_29_3_mysql.zip"
DB_ZIP_PATH = DATA_DIR / "onet_mysql.zip"        # downloaded MySQL dump archive (temporary)
DB_FILE_PATH = DATA_DIR / "onet.db"              # final SQLite database produced by setup
EXTRACT_DIR = DATA_DIR / "onet_mysql_extracted"  # scratch dir for the unzipped .sql files (temporary)

# URLs for other required data files are in a separate text data archive.
ONET_TEXT_URL = "https://www.onetcenter.org/dl_files/database/db_29_3_text.zip"
TEXT_ZIP_PATH = DATA_DIR / "onet_text.zip"       # downloaded text archive (temporary)
TASK_RATINGS_PATH = DATA_DIR / "Task Ratings.txt"
DWA_REFERENCE_PATH = DATA_DIR / "DWA Reference.txt"
def setup_data_and_database():
    """
    Main function to orchestrate the data setup.

    Ensures the data directory exists, then downloads and sets up the O*NET
    database and any other required data files. Each step is idempotent:
    work whose outputs already exist is skipped, so this is safe to re-run.
    """
    logging.info("Starting data and database setup...")
    # parents=True keeps this robust if DATA_DIR is ever changed to a nested path.
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    _setup_onet_database()
    _download_additional_data()
    logging.info("Data and database setup complete.")
def _convert_mysql_dump_to_sqlite(mysql_dump):
    """Minimal textual conversion of a MySQL dump for SQLite.

    Strips backtick identifier quoting and replaces MySQL ``ENGINE=InnoDB...``
    table options (up to the statement terminator) with a plain ``);``.
    """
    sqlite_dump = mysql_dump.replace('`', '')
    return re.sub(r'\) ENGINE=InnoDB.*?;', ');', sqlite_dump, flags=re.DOTALL)


def _run_sqlite_script(script):
    """Execute *script* against DB_FILE_PATH, always closing the connection.

    The try/finally guarantees the connection is released even when a
    statement fails, so a failed import never leaves the database file
    held open (which would also break the cleanup unlink on some platforms).
    """
    conn = sqlite3.connect(DB_FILE_PATH)
    try:
        conn.executescript(script)
    finally:
        conn.close()


def _setup_onet_database():
    """
    Downloads the O*NET MySQL database, extracts it, and imports it into a
    new SQLite database, following performance best practices: bulk import
    with durability disabled, then restore reliability settings and optimize.

    Performs minimal text-based conversion of the MySQL dump to make it
    compatible with SQLite before importing. On failure the partially built
    database file is removed and the exception re-raised; temporary files
    are cleaned up in all cases.
    """
    if DB_FILE_PATH.exists():
        logging.info("O*NET database already exists at %s. Skipping setup.", DB_FILE_PATH)
        return

    logging.info("O*NET database not found. Starting fresh setup.")
    # Ensure the extraction directory is clean before use
    if EXTRACT_DIR.exists():
        shutil.rmtree(EXTRACT_DIR)
    EXTRACT_DIR.mkdir(parents=True)

    try:
        # 1. Download if necessary
        if not DB_ZIP_PATH.exists():
            logging.info("Downloading O*NET database from %s", ONET_MYSQL_URL)
            _download_file(ONET_MYSQL_URL, DB_ZIP_PATH)
        else:
            logging.info("Using existing O*NET zip file at %s", DB_ZIP_PATH)

        # 2. Extract
        logging.info("Extracting O*NET database files to %s", EXTRACT_DIR)
        with zipfile.ZipFile(DB_ZIP_PATH, 'r') as zip_ref:
            zip_ref.extractall(EXTRACT_DIR)

        # 3. Create new DB with performance PRAGMAs. Durability is
        #    deliberately disabled for the bulk import; restored in step 5.
        logging.info("Creating new SQLite database with performance settings: %s", DB_FILE_PATH)
        _run_sqlite_script("""
            PRAGMA journal_mode = OFF;
            PRAGMA synchronous = 0;
            PRAGMA cache_size = 1000000;
            PRAGMA locking_mode = EXCLUSIVE;
            PRAGMA temp_store = MEMORY;
        """)

        # 4. Combine all SQL files, convert, and import in a single transaction
        logging.info("Combining and converting SQL files for single transaction import...")
        sql_files = sorted(EXTRACT_DIR.rglob('*.sql'))
        if not sql_files:
            raise FileNotFoundError(f"No SQL files found in {EXTRACT_DIR}")

        # Concatenate all files into one string, then convert for SQLite.
        mysql_dump = "\n".join(sql_file.read_text(encoding='utf-8') for sql_file in sql_files)
        sqlite_dump = _convert_mysql_dump_to_sqlite(mysql_dump)
        # One explicit transaction around the whole dump keeps the import fast.
        full_script = f"BEGIN TRANSACTION;\n{sqlite_dump}\nCOMMIT;"

        logging.info("Importing %d SQL files into database...", len(sql_files))
        _run_sqlite_script(full_script)
        logging.info("Database populated successfully.")

        # 5. Restore reliability settings and optimize
        logging.info("Restoring reliability settings and optimizing database...")
        _run_sqlite_script("""
            PRAGMA journal_mode = WAL;
            PRAGMA synchronous = NORMAL;
            PRAGMA locking_mode = NORMAL;
            PRAGMA temp_store = DEFAULT;
            PRAGMA foreign_keys = ON;
            PRAGMA optimize;
        """)
        # VACUUM cannot run inside an open transaction; execute it on its own.
        _run_sqlite_script("VACUUM;")
        logging.info("Database setup and optimization complete.")
    except Exception as e:
        logging.error("Failed during database setup: %s", e, exc_info=True)
        # Remove the partially built database so the next run starts cleanly.
        if DB_FILE_PATH.exists():
            DB_FILE_PATH.unlink()
        raise
    finally:
        # 6. Cleanup of temporary download/extraction artifacts, in all cases.
        logging.info("Cleaning up temporary files...")
        if DB_ZIP_PATH.exists():
            DB_ZIP_PATH.unlink()
        if EXTRACT_DIR.exists():
            shutil.rmtree(EXTRACT_DIR)
def _download_additional_data():
    """
    Downloads and extracts supplementary data files from the O*NET text archive.

    If the required text files already exist, this function does nothing.
    The downloaded zip archive is always removed afterwards.

    Raises:
        requests.exceptions.RequestException: if the download fails.
        zipfile.BadZipFile: if the downloaded archive is corrupt.
    """
    required_files = [TASK_RATINGS_PATH, DWA_REFERENCE_PATH]
    if all(p.exists() for p in required_files):
        logging.info("All required text data files already exist. Skipping download.")
        return

    logging.info("One or more text data files are missing. Downloading and extracting from archive...")
    try:
        _download_file(ONET_TEXT_URL, TEXT_ZIP_PATH)
        logging.info("Unzipping text data archive...")
        with zipfile.ZipFile(TEXT_ZIP_PATH, 'r') as zip_ref:
            # Extract only the files we need, without creating subdirectories
            for target_path in required_files:
                if target_path.exists():
                    continue
                # Find the corresponding file within the zip archive's directory structure
                member_name = next((m for m in zip_ref.namelist() if m.endswith(target_path.name)), None)
                if member_name:
                    # Stream the member to disk instead of reading it fully into memory.
                    with zip_ref.open(member_name) as source, open(target_path, 'wb') as target:
                        shutil.copyfileobj(source, target)
                    logging.info("Extracted %s", target_path.name)
                else:
                    logging.warning("Could not find %s in the text data archive.", target_path.name)
    except requests.exceptions.RequestException as e:
        logging.error("Failed to download O*NET text data archive: %s", e)
        raise
    except zipfile.BadZipFile as e:
        logging.error("Failed to process the text data archive: %s", e)
        raise
    finally:
        # Clean up the downloaded zip file
        if TEXT_ZIP_PATH.exists():
            TEXT_ZIP_PATH.unlink()
            logging.info("Cleaned up downloaded text archive zip file.")
def _download_file(url, destination):
    """
    Helper function to download a file from a URL, streaming in chunks so
    large files are never held fully in memory.

    Args:
        url: Source URL to fetch.
        destination: pathlib.Path the response body is written to.

    Raises:
        requests.exceptions.RequestException: on connection errors, timeouts,
            or non-2xx HTTP status codes.
    """
    logging.info("Downloading from %s to %s", url, destination)
    # (connect, read) timeout: without it a stalled connection hangs forever.
    with requests.get(url, stream=True, timeout=(10, 300)) as r:
        r.raise_for_status()
        with open(destination, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    logging.info("Download of %s complete.", destination.name)
def get_db_connection():
    """
    Open and return a sqlite3 connection to the O*NET database.

    Returns None (after logging an error) when the database file does not
    exist yet or when the connection attempt fails.
    """
    if not DB_FILE_PATH.exists():
        logging.error("Database file not found at %s. Run the setup process first.", DB_FILE_PATH)
        return None
    try:
        return sqlite3.connect(DB_FILE_PATH)
    except sqlite3.Error as e:
        logging.error("Failed to connect to the database: %s", e)
        return None
if __name__ == '__main__':
    # This allows the data setup to be run directly from the command line,
    # which is useful for initialization or debugging.
    setup_data_and_database()