import streamlit as st import sqlite3 import pandas as pd import graphviz import textwrap # --- Database Setup --- DB_FILE = "onet.database" @st.cache_resource def get_db_connection(): """Establishes a connection to the SQLite database.""" conn = sqlite3.connect(DB_FILE) conn.row_factory = sqlite3.Row # Access columns by name return conn @st.cache_data def get_occupations(_conn): """Fetches all occupations from the database.""" df = pd.read_sql_query( "SELECT onetsoc_code, title FROM occupation_data ORDER BY title", _conn ) return df @st.cache_data def get_iwas_for_occupation(_conn, onetsoc_code): """ Fetches IWAs for a given occupation. An occupation is linked to Work Activities (element_id in work_activities table). These Work Activity element_ids are then used in iwa_reference to find associated IWAs. """ query = """ SELECT DISTINCT ir.iwa_id, ir.iwa_title FROM work_activities wa JOIN iwa_reference ir ON wa.element_id = ir.element_id WHERE wa.onetsoc_code = ? ORDER BY ir.iwa_title; """ df = pd.read_sql_query(query, _conn, params=(onetsoc_code,)) return df @st.cache_data def get_dwas_for_iwas(_conn, iwa_ids): """Fetches DWAs for a list of IWA IDs.""" if not iwa_ids: return pd.DataFrame() placeholders = ",".join( "?" for _ in iwa_ids ) # Create one placeholder for each IWA ID query = f""" SELECT DISTINCT dr.dwa_id, dr.dwa_title, dr.iwa_id -- to link back to the IWA FROM dwa_reference dr WHERE dr.iwa_id IN ({placeholders}) ORDER BY dr.dwa_title; """ df = pd.read_sql_query(query, _conn, params=iwa_ids) return df @st.cache_data def get_tasks_for_dwas(_conn, onetsoc_code, dwa_ids): """Fetches tasks for a given occupation and list of DWA IDs.""" if not dwa_ids: return pd.DataFrame() placeholders = ",".join( "?" for _ in dwa_ids ) # Create one placeholder for each DWA ID query = f""" SELECT DISTINCT ts.task_id, ts.task, t2d.dwa_id -- to link back to the DWA FROM task_statements ts JOIN tasks_to_dwas t2d ON ts.task_id = t2d.task_id WHERE ts.onetsoc_code = ? AND t2d.dwa_id IN ({placeholders}) ORDER BY ts.task; """ # The parameters list should first contain onetsoc_code, then all DWA IDs. params = [onetsoc_code] + dwa_ids df = pd.read_sql_query(query, _conn, params=params) return df def smart_wrap(text, width=40): """Wraps text for better display in graph nodes.""" return "\n".join( textwrap.wrap( text, width=width, break_long_words=True, replace_whitespace=False, drop_whitespace=False, ) ) # --- Streamlit App Layout --- st.set_page_config(layout="wide") # Check if database file exists try: # Attempt to open for binary read to check existence and basic readability with open(DB_FILE, "rb") as f: pass conn = get_db_connection() except FileNotFoundError: st.error( f"Database file '{DB_FILE}' not found. Please ensure it is in the same directory as the script." ) st.stop() except sqlite3.Error as e: st.error(f"Error connecting to or reading the database '{DB_FILE}': {e}") st.info( "Please ensure the database file is a valid SQLite database and not corrupted." ) st.stop() st.title("O*NET Occupation Hierarchy Explorer") st.markdown(""" This application visualizes the relationships between Occupations, Intermediate Work Activities (IWAs), Detailed Work Activities (DWAs), and Task Statements from the O*NET database. Select an occupation from the control panel on the left to view its hierarchical breakdown. """) # --- Sidebar for Occupation Selection --- col1, col2 = st.columns([0.3, 0.7], gap="large") with col1: st.header("Control Panel") occupations_df = get_occupations(conn) if occupations_df.empty: st.warning("No occupations found in the database.") st.stop() # Create a display string with code and title for the selectbox occupations_df["display_name"] = ( occupations_df["title"] + " (" + occupations_df["onetsoc_code"] + ")" ) search_term = st.text_input( "Search for an occupation:", placeholder="E.g., Software Developer" ) if search_term: # Ensure search term is treated as a literal string for regex, if needed, or use basic string methods search_term_safe = ( search_term.replace("[", "\\[") .replace("]", "\\]") .replace("(", "\\(") .replace(")", "\\)") ) filtered_occupations = occupations_df[ occupations_df["title"].str.contains( search_term_safe, case=False, regex=True ) | occupations_df["onetsoc_code"].str.contains( search_term_safe, case=False, regex=True ) ] else: filtered_occupations = occupations_df if not filtered_occupations.empty: # Sort filtered occupations for consistent display in selectbox filtered_occupations_sorted = filtered_occupations.sort_values("display_name") selected_occupation_display_name = st.selectbox( "Choose an occupation:", options=filtered_occupations_sorted["display_name"], index=0, # Default to the first item ) # Get the onetsoc_code and title from the selected display name selected_row = occupations_df[ occupations_df["display_name"] == selected_occupation_display_name ].iloc[0] selected_onetsoc_code = selected_row["onetsoc_code"] selected_occupation_title = selected_row["title"] else: st.warning("No occupations match your search term.") selected_onetsoc_code = None selected_occupation_title = None # --- Main Area for Graph Display --- with col2: st.header("Occupation Graph") if selected_onetsoc_code: st.subheader( f"Displaying: {selected_occupation_title} ({selected_onetsoc_code})" ) iwas_df = get_iwas_for_occupation(conn, selected_onetsoc_code) if iwas_df.empty: st.info( "No Intermediate Work Activities (IWAs) found directly linked for this occupation." ) else: graph = graphviz.Digraph( comment=f"O*NET Hierarchy for {selected_onetsoc_code}" ) graph.attr( rankdir="LR", splines="spline", concentrate="false", nodesep="0.5", ranksep="0.8", ) # Occupation Node occ_node_id = f"occ_{selected_onetsoc_code.replace('.', '_')}" # Ensure ID is valid for DOT occ_label = smart_wrap( f"Occupation: {selected_occupation_title}\n({selected_onetsoc_code})", width=30, ) graph.node( occ_node_id, label=occ_label, shape="ellipse", style="filled", fillcolor="skyblue", ) # Fetch DWAs iwa_ids = iwas_df["iwa_id"].tolist() dwas_df = get_dwas_for_iwas(conn, iwa_ids) dwa_ids_for_tasks = [] if not dwas_df.empty: dwa_ids_for_tasks = dwas_df["dwa_id"].unique().tolist() # Fetch Tasks tasks_df = get_tasks_for_dwas( conn, selected_onetsoc_code, dwa_ids_for_tasks ) # Add IWA Nodes and Edges for _, iwa_row in iwas_df.iterrows(): iwa_node_id = f"iwa_{str(iwa_row['iwa_id']).replace('.', '_')}" iwa_label = smart_wrap( f"IWA: {iwa_row['iwa_title']}\n(ID: {iwa_row['iwa_id']})", width=35 ) graph.node( iwa_node_id, label=iwa_label, shape="box", style="filled", fillcolor="khaki", ) graph.edge(occ_node_id, iwa_node_id) # Add DWA Nodes and Edges (for this IWA) current_iwa_dwas = dwas_df[dwas_df["iwa_id"] == iwa_row["iwa_id"]] for _, dwa_row in current_iwa_dwas.iterrows(): dwa_node_id = f"dwa_{str(dwa_row['dwa_id']).replace('.', '_')}" dwa_label = smart_wrap( f"DWA: {dwa_row['dwa_title']}\n(ID: {dwa_row['dwa_id']})", width=40, ) graph.node( dwa_node_id, label=dwa_label, shape="box", style="filled", fillcolor="lightcoral", ) graph.edge(iwa_node_id, dwa_node_id) # Add Task Nodes and Edges (for this DWA and Occupation) current_dwa_tasks = tasks_df[ tasks_df["dwa_id"] == dwa_row["dwa_id"] ] for _, task_row in current_dwa_tasks.iterrows(): # Ensure task_id is a string and valid for DOT task_id_str = str(task_row["task_id"]).split(".")[ 0 ] # Handle decimal task_ids if they appear task_node_id = f"task_{task_id_str}" task_label = smart_wrap( f"Task: {task_row['task']}\n(ID: {task_id_str})", width=50 ) graph.node( task_node_id, label=task_label, shape="note", style="filled", fillcolor="lightgray", ) graph.edge(dwa_node_id, task_node_id) if ( not graph.body or len(graph.body) <= 1 ): # Check if any nodes were actually added beyond the occupation st.info( "No hierarchical data (IWAs, DWAs, Tasks) to display for this occupation after initial selection." ) else: try: st.graphviz_chart(graph, use_container_width=True) with st.expander("View Data Tables for Selected Occupation"): st.markdown("##### Intermediate Work Activities (IWAs)") st.dataframe(iwas_df, use_container_width=True) if not dwas_df.empty: st.markdown("##### Detailed Work Activities (DWAs)") st.dataframe(dwas_df, use_container_width=True) if not tasks_df.empty: st.markdown("##### Task Statements") st.dataframe(tasks_df, use_container_width=True) except Exception as e: st.error( f"Could not render the graph. Graphviz might not be installed correctly or there's an issue with the graph data: {e}" ) st.text("Graphviz DOT source (for debugging):") st.code(graph.source, language="dot") else: st.info("Select an occupation from the control panel to see its graph.") # Instructions to run the app: # 1. Save this code as a Python file (e.g., onet_explorer_app.py). # 2. Ensure the 'onet.database' file is in the same directory. # 3. Install the required libraries: pip install streamlit pandas graphviz # 4. Open your terminal or command prompt, navigate to the directory, and run: # streamlit run onet_explorer_app.py