sprint-econtai/archive/onet_explorer_app.py

import streamlit as st
import sqlite3
import pandas as pd
import graphviz
import textwrap

# --- Database Setup ---
DB_FILE = "onet.database"


@st.cache_resource
def get_db_connection():
    """Establishes a connection to the SQLite database."""
    conn = sqlite3.connect(DB_FILE)
    conn.row_factory = sqlite3.Row  # Access columns by name
    return conn


@st.cache_data
def get_occupations(_conn):
    """Fetches all occupations from the database."""
    df = pd.read_sql_query(
        "SELECT onetsoc_code, title FROM occupation_data ORDER BY title", _conn
    )
    return df


@st.cache_data
def get_iwas_for_occupation(_conn, onetsoc_code):
    """
    Fetches IWAs for a given occupation.
    An occupation is linked to Work Activities (element_id in work_activities table).
    These Work Activity element_ids are then used in iwa_reference to find associated IWAs.
    """
    query = """
    SELECT DISTINCT
        ir.iwa_id,
        ir.iwa_title
    FROM work_activities wa
    JOIN iwa_reference ir ON wa.element_id = ir.element_id
    WHERE wa.onetsoc_code = ?
    ORDER BY ir.iwa_title;
    """
    df = pd.read_sql_query(query, _conn, params=(onetsoc_code,))
    return df


@st.cache_data
def get_dwas_for_iwas(_conn, iwa_ids):
    """Fetches DWAs for a list of IWA IDs."""
    if not iwa_ids:
        return pd.DataFrame()
    placeholders = ",".join(
        "?" for _ in iwa_ids
    )  # Create one placeholder for each IWA ID
    query = f"""
    SELECT DISTINCT
        dr.dwa_id,
        dr.dwa_title,
        dr.iwa_id  -- to link back to the IWA
    FROM dwa_reference dr
    WHERE dr.iwa_id IN ({placeholders})
    ORDER BY dr.dwa_title;
    """
    df = pd.read_sql_query(query, _conn, params=iwa_ids)
    return df


@st.cache_data
def get_tasks_for_dwas(_conn, onetsoc_code, dwa_ids):
    """Fetches tasks for a given occupation and list of DWA IDs."""
    if not dwa_ids:
        return pd.DataFrame()
    placeholders = ",".join(
        "?" for _ in dwa_ids
    )  # Create one placeholder for each DWA ID
    query = f"""
    SELECT DISTINCT
        ts.task_id,
        ts.task,
        t2d.dwa_id -- to link back to the DWA
    FROM task_statements ts
    JOIN tasks_to_dwas t2d ON ts.task_id = t2d.task_id
    WHERE ts.onetsoc_code = ? AND t2d.dwa_id IN ({placeholders})
    ORDER BY ts.task;
    """
    # The parameters list should first contain onetsoc_code, then all DWA IDs.
    params = [onetsoc_code] + dwa_ids
    df = pd.read_sql_query(query, _conn, params=params)
    return df


def smart_wrap(text, width=40):
    """Wraps text for better display in graph nodes."""
    return "\n".join(
        textwrap.wrap(
            text,
            width=width,
            break_long_words=True,
            replace_whitespace=False,
            drop_whitespace=False,
        )
    )


# --- Streamlit App Layout ---
st.set_page_config(layout="wide")

# Check if database file exists
try:
    # Attempt to open for binary read to check existence and basic readability
    with open(DB_FILE, "rb") as f:
        pass
    conn = get_db_connection()
except FileNotFoundError:
    st.error(
        f"Database file '{DB_FILE}' not found. Please ensure it is in the same directory as the script."
    )
    st.stop()
except sqlite3.Error as e:
    st.error(f"Error connecting to or reading the database '{DB_FILE}': {e}")
    st.info(
        "Please ensure the database file is a valid SQLite database and not corrupted."
    )
    st.stop()


st.title("O*NET Occupation Hierarchy Explorer")
st.markdown("""
This application visualizes the relationships between Occupations, Intermediate Work Activities (IWAs),
Detailed Work Activities (DWAs), and Task Statements from the O*NET database.
Select an occupation from the control panel on the left to view its hierarchical breakdown.
""")

# --- Sidebar for Occupation Selection ---
col1, col2 = st.columns([0.3, 0.7], gap="large")

with col1:
    st.header("Control Panel")
    occupations_df = get_occupations(conn)

    if occupations_df.empty:
        st.warning("No occupations found in the database.")
        st.stop()

    # Create a display string with code and title for the selectbox
    occupations_df["display_name"] = (
        occupations_df["title"] + " (" + occupations_df["onetsoc_code"] + ")"
    )

    search_term = st.text_input(
        "Search for an occupation:", placeholder="E.g., Software Developer"
    )

    if search_term:
        # Ensure search term is treated as a literal string for regex, if needed, or use basic string methods
        search_term_safe = (
            search_term.replace("[", "\\[")
            .replace("]", "\\]")
            .replace("(", "\\(")
            .replace(")", "\\)")
        )
        filtered_occupations = occupations_df[
            occupations_df["title"].str.contains(
                search_term_safe, case=False, regex=True
            )
            | occupations_df["onetsoc_code"].str.contains(
                search_term_safe, case=False, regex=True
            )
        ]
    else:
        filtered_occupations = occupations_df

    if not filtered_occupations.empty:
        # Sort filtered occupations for consistent display in selectbox
        filtered_occupations_sorted = filtered_occupations.sort_values("display_name")
        selected_occupation_display_name = st.selectbox(
            "Choose an occupation:",
            options=filtered_occupations_sorted["display_name"],
            index=0,  # Default to the first item
        )

        # Get the onetsoc_code and title from the selected display name
        selected_row = occupations_df[
            occupations_df["display_name"] == selected_occupation_display_name
        ].iloc[0]
        selected_onetsoc_code = selected_row["onetsoc_code"]
        selected_occupation_title = selected_row["title"]
    else:
        st.warning("No occupations match your search term.")
        selected_onetsoc_code = None
        selected_occupation_title = None

# --- Main Area for Graph Display ---
with col2:
    st.header("Occupation Graph")
    if selected_onetsoc_code:
        st.subheader(
            f"Displaying: {selected_occupation_title} ({selected_onetsoc_code})"
        )

        iwas_df = get_iwas_for_occupation(conn, selected_onetsoc_code)

        if iwas_df.empty:
            st.info(
                "No Intermediate Work Activities (IWAs) found directly linked for this occupation."
            )
        else:
            graph = graphviz.Digraph(
                comment=f"O*NET Hierarchy for {selected_onetsoc_code}"
            )
            graph.attr(
                rankdir="LR",
                splines="spline",
                concentrate="false",
                nodesep="0.5",
                ranksep="0.8",
            )

            # Occupation Node
            occ_node_id = f"occ_{selected_onetsoc_code.replace('.', '_')}"  # Ensure ID is valid for DOT
            occ_label = smart_wrap(
                f"Occupation: {selected_occupation_title}\n({selected_onetsoc_code})",
                width=30,
            )
            graph.node(
                occ_node_id,
                label=occ_label,
                shape="ellipse",
                style="filled",
                fillcolor="skyblue",
            )

            # Fetch DWAs
            iwa_ids = iwas_df["iwa_id"].tolist()
            dwas_df = get_dwas_for_iwas(conn, iwa_ids)

            dwa_ids_for_tasks = []
            if not dwas_df.empty:
                dwa_ids_for_tasks = dwas_df["dwa_id"].unique().tolist()

            # Fetch Tasks
            tasks_df = get_tasks_for_dwas(
                conn, selected_onetsoc_code, dwa_ids_for_tasks
            )

            # Add IWA Nodes and Edges
            for _, iwa_row in iwas_df.iterrows():
                iwa_node_id = f"iwa_{str(iwa_row['iwa_id']).replace('.', '_')}"
                iwa_label = smart_wrap(
                    f"IWA: {iwa_row['iwa_title']}\n(ID: {iwa_row['iwa_id']})", width=35
                )
                graph.node(
                    iwa_node_id,
                    label=iwa_label,
                    shape="box",
                    style="filled",
                    fillcolor="khaki",
                )
                graph.edge(occ_node_id, iwa_node_id)

                # Add DWA Nodes and Edges (for this IWA)
                current_iwa_dwas = dwas_df[dwas_df["iwa_id"] == iwa_row["iwa_id"]]
                for _, dwa_row in current_iwa_dwas.iterrows():
                    dwa_node_id = f"dwa_{str(dwa_row['dwa_id']).replace('.', '_')}"
                    dwa_label = smart_wrap(
                        f"DWA: {dwa_row['dwa_title']}\n(ID: {dwa_row['dwa_id']})",
                        width=40,
                    )
                    graph.node(
                        dwa_node_id,
                        label=dwa_label,
                        shape="box",
                        style="filled",
                        fillcolor="lightcoral",
                    )
                    graph.edge(iwa_node_id, dwa_node_id)

                    # Add Task Nodes and Edges (for this DWA and Occupation)
                    current_dwa_tasks = tasks_df[
                        tasks_df["dwa_id"] == dwa_row["dwa_id"]
                    ]
                    for _, task_row in current_dwa_tasks.iterrows():
                        # Ensure task_id is a string and valid for DOT
                        task_id_str = str(task_row["task_id"]).split(".")[
                            0
                        ]  # Handle decimal task_ids if they appear
                        task_node_id = f"task_{task_id_str}"
                        task_label = smart_wrap(
                            f"Task: {task_row['task']}\n(ID: {task_id_str})", width=50
                        )
                        graph.node(
                            task_node_id,
                            label=task_label,
                            shape="note",
                            style="filled",
                            fillcolor="lightgray",
                        )
                        graph.edge(dwa_node_id, task_node_id)

            if (
                not graph.body or len(graph.body) <= 1
            ):  # Check if any nodes were actually added beyond the occupation
                st.info(
                    "No hierarchical data (IWAs, DWAs, Tasks) to display for this occupation after initial selection."
                )
            else:
                try:
                    st.graphviz_chart(graph, use_container_width=True)
                    with st.expander("View Data Tables for Selected Occupation"):
                        st.markdown("##### Intermediate Work Activities (IWAs)")
                        st.dataframe(iwas_df, use_container_width=True)
                        if not dwas_df.empty:
                            st.markdown("##### Detailed Work Activities (DWAs)")
                            st.dataframe(dwas_df, use_container_width=True)
                        if not tasks_df.empty:
                            st.markdown("##### Task Statements")
                            st.dataframe(tasks_df, use_container_width=True)

                except Exception as e:
                    st.error(
                        f"Could not render the graph. Graphviz might not be installed correctly or there's an issue with the graph data: {e}"
                    )
                    st.text("Graphviz DOT source (for debugging):")
                    st.code(graph.source, language="dot")
    else:
        st.info("Select an occupation from the control panel to see its graph.")

# Instructions to run the app:
# 1. Save this code as a Python file (e.g., onet_explorer_app.py).
# 2. Ensure the 'onet.database' file is in the same directory.
# 3. Install the required libraries: pip install streamlit pandas graphviz
# 4. Open your terminal or command prompt, navigate to the directory, and run:
#    streamlit run onet_explorer_app.py