sprint-econtai/archive/onet_explorer_app.py
Félix Dorn 43076bcbb1 old
2025-07-15 00:41:05 +02:00

334 lines
12 KiB
Python

import streamlit as st
import sqlite3
import pandas as pd
import graphviz
import textwrap
# --- Database Setup ---
DB_FILE = "onet.database"
@st.cache_resource
def get_db_connection():
"""Establishes a connection to the SQLite database."""
conn = sqlite3.connect(DB_FILE)
conn.row_factory = sqlite3.Row # Access columns by name
return conn
@st.cache_data
def get_occupations(_conn):
"""Fetches all occupations from the database."""
df = pd.read_sql_query(
"SELECT onetsoc_code, title FROM occupation_data ORDER BY title", _conn
)
return df
@st.cache_data
def get_iwas_for_occupation(_conn, onetsoc_code):
"""
Fetches IWAs for a given occupation.
An occupation is linked to Work Activities (element_id in work_activities table).
These Work Activity element_ids are then used in iwa_reference to find associated IWAs.
"""
query = """
SELECT DISTINCT
ir.iwa_id,
ir.iwa_title
FROM work_activities wa
JOIN iwa_reference ir ON wa.element_id = ir.element_id
WHERE wa.onetsoc_code = ?
ORDER BY ir.iwa_title;
"""
df = pd.read_sql_query(query, _conn, params=(onetsoc_code,))
return df
@st.cache_data
def get_dwas_for_iwas(_conn, iwa_ids):
"""Fetches DWAs for a list of IWA IDs."""
if not iwa_ids:
return pd.DataFrame()
placeholders = ",".join(
"?" for _ in iwa_ids
) # Create one placeholder for each IWA ID
query = f"""
SELECT DISTINCT
dr.dwa_id,
dr.dwa_title,
dr.iwa_id -- to link back to the IWA
FROM dwa_reference dr
WHERE dr.iwa_id IN ({placeholders})
ORDER BY dr.dwa_title;
"""
df = pd.read_sql_query(query, _conn, params=iwa_ids)
return df
@st.cache_data
def get_tasks_for_dwas(_conn, onetsoc_code, dwa_ids):
"""Fetches tasks for a given occupation and list of DWA IDs."""
if not dwa_ids:
return pd.DataFrame()
placeholders = ",".join(
"?" for _ in dwa_ids
) # Create one placeholder for each DWA ID
query = f"""
SELECT DISTINCT
ts.task_id,
ts.task,
t2d.dwa_id -- to link back to the DWA
FROM task_statements ts
JOIN tasks_to_dwas t2d ON ts.task_id = t2d.task_id
WHERE ts.onetsoc_code = ? AND t2d.dwa_id IN ({placeholders})
ORDER BY ts.task;
"""
# The parameters list should first contain onetsoc_code, then all DWA IDs.
params = [onetsoc_code] + dwa_ids
df = pd.read_sql_query(query, _conn, params=params)
return df
def smart_wrap(text, width=40):
"""Wraps text for better display in graph nodes."""
return "\n".join(
textwrap.wrap(
text,
width=width,
break_long_words=True,
replace_whitespace=False,
drop_whitespace=False,
)
)
# --- Streamlit App Layout ---
st.set_page_config(layout="wide")
# Check if database file exists
try:
# Attempt to open for binary read to check existence and basic readability
with open(DB_FILE, "rb") as f:
pass
conn = get_db_connection()
except FileNotFoundError:
st.error(
f"Database file '{DB_FILE}' not found. Please ensure it is in the same directory as the script."
)
st.stop()
except sqlite3.Error as e:
st.error(f"Error connecting to or reading the database '{DB_FILE}': {e}")
st.info(
"Please ensure the database file is a valid SQLite database and not corrupted."
)
st.stop()
st.title("O*NET Occupation Hierarchy Explorer")
st.markdown("""
This application visualizes the relationships between Occupations, Intermediate Work Activities (IWAs),
Detailed Work Activities (DWAs), and Task Statements from the O*NET database.
Select an occupation from the control panel on the left to view its hierarchical breakdown.
""")
# --- Sidebar for Occupation Selection ---
col1, col2 = st.columns([0.3, 0.7], gap="large")
with col1:
st.header("Control Panel")
occupations_df = get_occupations(conn)
if occupations_df.empty:
st.warning("No occupations found in the database.")
st.stop()
# Create a display string with code and title for the selectbox
occupations_df["display_name"] = (
occupations_df["title"] + " (" + occupations_df["onetsoc_code"] + ")"
)
search_term = st.text_input(
"Search for an occupation:", placeholder="E.g., Software Developer"
)
if search_term:
# Ensure search term is treated as a literal string for regex, if needed, or use basic string methods
search_term_safe = (
search_term.replace("[", "\\[")
.replace("]", "\\]")
.replace("(", "\\(")
.replace(")", "\\)")
)
filtered_occupations = occupations_df[
occupations_df["title"].str.contains(
search_term_safe, case=False, regex=True
)
| occupations_df["onetsoc_code"].str.contains(
search_term_safe, case=False, regex=True
)
]
else:
filtered_occupations = occupations_df
if not filtered_occupations.empty:
# Sort filtered occupations for consistent display in selectbox
filtered_occupations_sorted = filtered_occupations.sort_values("display_name")
selected_occupation_display_name = st.selectbox(
"Choose an occupation:",
options=filtered_occupations_sorted["display_name"],
index=0, # Default to the first item
)
# Get the onetsoc_code and title from the selected display name
selected_row = occupations_df[
occupations_df["display_name"] == selected_occupation_display_name
].iloc[0]
selected_onetsoc_code = selected_row["onetsoc_code"]
selected_occupation_title = selected_row["title"]
else:
st.warning("No occupations match your search term.")
selected_onetsoc_code = None
selected_occupation_title = None
# --- Main Area for Graph Display ---
with col2:
st.header("Occupation Graph")
if selected_onetsoc_code:
st.subheader(
f"Displaying: {selected_occupation_title} ({selected_onetsoc_code})"
)
iwas_df = get_iwas_for_occupation(conn, selected_onetsoc_code)
if iwas_df.empty:
st.info(
"No Intermediate Work Activities (IWAs) found directly linked for this occupation."
)
else:
graph = graphviz.Digraph(
comment=f"O*NET Hierarchy for {selected_onetsoc_code}"
)
graph.attr(
rankdir="LR",
splines="spline",
concentrate="false",
nodesep="0.5",
ranksep="0.8",
)
# Occupation Node
occ_node_id = f"occ_{selected_onetsoc_code.replace('.', '_')}" # Ensure ID is valid for DOT
occ_label = smart_wrap(
f"Occupation: {selected_occupation_title}\n({selected_onetsoc_code})",
width=30,
)
graph.node(
occ_node_id,
label=occ_label,
shape="ellipse",
style="filled",
fillcolor="skyblue",
)
# Fetch DWAs
iwa_ids = iwas_df["iwa_id"].tolist()
dwas_df = get_dwas_for_iwas(conn, iwa_ids)
dwa_ids_for_tasks = []
if not dwas_df.empty:
dwa_ids_for_tasks = dwas_df["dwa_id"].unique().tolist()
# Fetch Tasks
tasks_df = get_tasks_for_dwas(
conn, selected_onetsoc_code, dwa_ids_for_tasks
)
# Add IWA Nodes and Edges
for _, iwa_row in iwas_df.iterrows():
iwa_node_id = f"iwa_{str(iwa_row['iwa_id']).replace('.', '_')}"
iwa_label = smart_wrap(
f"IWA: {iwa_row['iwa_title']}\n(ID: {iwa_row['iwa_id']})", width=35
)
graph.node(
iwa_node_id,
label=iwa_label,
shape="box",
style="filled",
fillcolor="khaki",
)
graph.edge(occ_node_id, iwa_node_id)
# Add DWA Nodes and Edges (for this IWA)
current_iwa_dwas = dwas_df[dwas_df["iwa_id"] == iwa_row["iwa_id"]]
for _, dwa_row in current_iwa_dwas.iterrows():
dwa_node_id = f"dwa_{str(dwa_row['dwa_id']).replace('.', '_')}"
dwa_label = smart_wrap(
f"DWA: {dwa_row['dwa_title']}\n(ID: {dwa_row['dwa_id']})",
width=40,
)
graph.node(
dwa_node_id,
label=dwa_label,
shape="box",
style="filled",
fillcolor="lightcoral",
)
graph.edge(iwa_node_id, dwa_node_id)
# Add Task Nodes and Edges (for this DWA and Occupation)
current_dwa_tasks = tasks_df[
tasks_df["dwa_id"] == dwa_row["dwa_id"]
]
for _, task_row in current_dwa_tasks.iterrows():
# Ensure task_id is a string and valid for DOT
task_id_str = str(task_row["task_id"]).split(".")[
0
] # Handle decimal task_ids if they appear
task_node_id = f"task_{task_id_str}"
task_label = smart_wrap(
f"Task: {task_row['task']}\n(ID: {task_id_str})", width=50
)
graph.node(
task_node_id,
label=task_label,
shape="note",
style="filled",
fillcolor="lightgray",
)
graph.edge(dwa_node_id, task_node_id)
if (
not graph.body or len(graph.body) <= 1
): # Check if any nodes were actually added beyond the occupation
st.info(
"No hierarchical data (IWAs, DWAs, Tasks) to display for this occupation after initial selection."
)
else:
try:
st.graphviz_chart(graph, use_container_width=True)
with st.expander("View Data Tables for Selected Occupation"):
st.markdown("##### Intermediate Work Activities (IWAs)")
st.dataframe(iwas_df, use_container_width=True)
if not dwas_df.empty:
st.markdown("##### Detailed Work Activities (DWAs)")
st.dataframe(dwas_df, use_container_width=True)
if not tasks_df.empty:
st.markdown("##### Task Statements")
st.dataframe(tasks_df, use_container_width=True)
except Exception as e:
st.error(
f"Could not render the graph. Graphviz might not be installed correctly or there's an issue with the graph data: {e}"
)
st.text("Graphviz DOT source (for debugging):")
st.code(graph.source, language="dot")
else:
st.info("Select an occupation from the control panel to see its graph.")
# Instructions to run the app:
# 1. Save this code as a Python file (e.g., onet_explorer_app.py).
# 2. Ensure the 'onet.database' file is in the same directory.
# 3. Install the required libraries: pip install streamlit pandas graphviz
# 4. Open your terminal or command prompt, navigate to the directory, and run:
# streamlit run onet_explorer_app.py