334 lines
12 KiB
Python
334 lines
12 KiB
Python
import streamlit as st
|
|
import sqlite3
|
|
import pandas as pd
|
|
import graphviz
|
|
import textwrap
|
|
|
|
# --- Database Setup ---
|
|
DB_FILE = "onet.database"
|
|
|
|
|
|
@st.cache_resource
|
|
def get_db_connection():
|
|
"""Establishes a connection to the SQLite database."""
|
|
conn = sqlite3.connect(DB_FILE)
|
|
conn.row_factory = sqlite3.Row # Access columns by name
|
|
return conn
|
|
|
|
|
|
@st.cache_data
|
|
def get_occupations(_conn):
|
|
"""Fetches all occupations from the database."""
|
|
df = pd.read_sql_query(
|
|
"SELECT onetsoc_code, title FROM occupation_data ORDER BY title", _conn
|
|
)
|
|
return df
|
|
|
|
|
|
@st.cache_data
|
|
def get_iwas_for_occupation(_conn, onetsoc_code):
|
|
"""
|
|
Fetches IWAs for a given occupation.
|
|
An occupation is linked to Work Activities (element_id in work_activities table).
|
|
These Work Activity element_ids are then used in iwa_reference to find associated IWAs.
|
|
"""
|
|
query = """
|
|
SELECT DISTINCT
|
|
ir.iwa_id,
|
|
ir.iwa_title
|
|
FROM work_activities wa
|
|
JOIN iwa_reference ir ON wa.element_id = ir.element_id
|
|
WHERE wa.onetsoc_code = ?
|
|
ORDER BY ir.iwa_title;
|
|
"""
|
|
df = pd.read_sql_query(query, _conn, params=(onetsoc_code,))
|
|
return df
|
|
|
|
|
|
@st.cache_data
|
|
def get_dwas_for_iwas(_conn, iwa_ids):
|
|
"""Fetches DWAs for a list of IWA IDs."""
|
|
if not iwa_ids:
|
|
return pd.DataFrame()
|
|
placeholders = ",".join(
|
|
"?" for _ in iwa_ids
|
|
) # Create one placeholder for each IWA ID
|
|
query = f"""
|
|
SELECT DISTINCT
|
|
dr.dwa_id,
|
|
dr.dwa_title,
|
|
dr.iwa_id -- to link back to the IWA
|
|
FROM dwa_reference dr
|
|
WHERE dr.iwa_id IN ({placeholders})
|
|
ORDER BY dr.dwa_title;
|
|
"""
|
|
df = pd.read_sql_query(query, _conn, params=iwa_ids)
|
|
return df
|
|
|
|
|
|
@st.cache_data
|
|
def get_tasks_for_dwas(_conn, onetsoc_code, dwa_ids):
|
|
"""Fetches tasks for a given occupation and list of DWA IDs."""
|
|
if not dwa_ids:
|
|
return pd.DataFrame()
|
|
placeholders = ",".join(
|
|
"?" for _ in dwa_ids
|
|
) # Create one placeholder for each DWA ID
|
|
query = f"""
|
|
SELECT DISTINCT
|
|
ts.task_id,
|
|
ts.task,
|
|
t2d.dwa_id -- to link back to the DWA
|
|
FROM task_statements ts
|
|
JOIN tasks_to_dwas t2d ON ts.task_id = t2d.task_id
|
|
WHERE ts.onetsoc_code = ? AND t2d.dwa_id IN ({placeholders})
|
|
ORDER BY ts.task;
|
|
"""
|
|
# The parameters list should first contain onetsoc_code, then all DWA IDs.
|
|
params = [onetsoc_code] + dwa_ids
|
|
df = pd.read_sql_query(query, _conn, params=params)
|
|
return df
|
|
|
|
|
|
def smart_wrap(text, width=40):
|
|
"""Wraps text for better display in graph nodes."""
|
|
return "\n".join(
|
|
textwrap.wrap(
|
|
text,
|
|
width=width,
|
|
break_long_words=True,
|
|
replace_whitespace=False,
|
|
drop_whitespace=False,
|
|
)
|
|
)
|
|
|
|
|
|
# --- Streamlit App Layout ---
|
|
st.set_page_config(layout="wide")
|
|
|
|
# Check if database file exists
|
|
try:
|
|
# Attempt to open for binary read to check existence and basic readability
|
|
with open(DB_FILE, "rb") as f:
|
|
pass
|
|
conn = get_db_connection()
|
|
except FileNotFoundError:
|
|
st.error(
|
|
f"Database file '{DB_FILE}' not found. Please ensure it is in the same directory as the script."
|
|
)
|
|
st.stop()
|
|
except sqlite3.Error as e:
|
|
st.error(f"Error connecting to or reading the database '{DB_FILE}': {e}")
|
|
st.info(
|
|
"Please ensure the database file is a valid SQLite database and not corrupted."
|
|
)
|
|
st.stop()
|
|
|
|
|
|
st.title("O*NET Occupation Hierarchy Explorer")
|
|
st.markdown("""
|
|
This application visualizes the relationships between Occupations, Intermediate Work Activities (IWAs),
|
|
Detailed Work Activities (DWAs), and Task Statements from the O*NET database.
|
|
Select an occupation from the control panel on the left to view its hierarchical breakdown.
|
|
""")
|
|
|
|
# --- Sidebar for Occupation Selection ---
|
|
col1, col2 = st.columns([0.3, 0.7], gap="large")
|
|
|
|
with col1:
|
|
st.header("Control Panel")
|
|
occupations_df = get_occupations(conn)
|
|
|
|
if occupations_df.empty:
|
|
st.warning("No occupations found in the database.")
|
|
st.stop()
|
|
|
|
# Create a display string with code and title for the selectbox
|
|
occupations_df["display_name"] = (
|
|
occupations_df["title"] + " (" + occupations_df["onetsoc_code"] + ")"
|
|
)
|
|
|
|
search_term = st.text_input(
|
|
"Search for an occupation:", placeholder="E.g., Software Developer"
|
|
)
|
|
|
|
if search_term:
|
|
# Ensure search term is treated as a literal string for regex, if needed, or use basic string methods
|
|
search_term_safe = (
|
|
search_term.replace("[", "\\[")
|
|
.replace("]", "\\]")
|
|
.replace("(", "\\(")
|
|
.replace(")", "\\)")
|
|
)
|
|
filtered_occupations = occupations_df[
|
|
occupations_df["title"].str.contains(
|
|
search_term_safe, case=False, regex=True
|
|
)
|
|
| occupations_df["onetsoc_code"].str.contains(
|
|
search_term_safe, case=False, regex=True
|
|
)
|
|
]
|
|
else:
|
|
filtered_occupations = occupations_df
|
|
|
|
if not filtered_occupations.empty:
|
|
# Sort filtered occupations for consistent display in selectbox
|
|
filtered_occupations_sorted = filtered_occupations.sort_values("display_name")
|
|
selected_occupation_display_name = st.selectbox(
|
|
"Choose an occupation:",
|
|
options=filtered_occupations_sorted["display_name"],
|
|
index=0, # Default to the first item
|
|
)
|
|
|
|
# Get the onetsoc_code and title from the selected display name
|
|
selected_row = occupations_df[
|
|
occupations_df["display_name"] == selected_occupation_display_name
|
|
].iloc[0]
|
|
selected_onetsoc_code = selected_row["onetsoc_code"]
|
|
selected_occupation_title = selected_row["title"]
|
|
else:
|
|
st.warning("No occupations match your search term.")
|
|
selected_onetsoc_code = None
|
|
selected_occupation_title = None
|
|
|
|
# --- Main Area for Graph Display ---
|
|
with col2:
|
|
st.header("Occupation Graph")
|
|
if selected_onetsoc_code:
|
|
st.subheader(
|
|
f"Displaying: {selected_occupation_title} ({selected_onetsoc_code})"
|
|
)
|
|
|
|
iwas_df = get_iwas_for_occupation(conn, selected_onetsoc_code)
|
|
|
|
if iwas_df.empty:
|
|
st.info(
|
|
"No Intermediate Work Activities (IWAs) found directly linked for this occupation."
|
|
)
|
|
else:
|
|
graph = graphviz.Digraph(
|
|
comment=f"O*NET Hierarchy for {selected_onetsoc_code}"
|
|
)
|
|
graph.attr(
|
|
rankdir="LR",
|
|
splines="spline",
|
|
concentrate="false",
|
|
nodesep="0.5",
|
|
ranksep="0.8",
|
|
)
|
|
|
|
# Occupation Node
|
|
occ_node_id = f"occ_{selected_onetsoc_code.replace('.', '_')}" # Ensure ID is valid for DOT
|
|
occ_label = smart_wrap(
|
|
f"Occupation: {selected_occupation_title}\n({selected_onetsoc_code})",
|
|
width=30,
|
|
)
|
|
graph.node(
|
|
occ_node_id,
|
|
label=occ_label,
|
|
shape="ellipse",
|
|
style="filled",
|
|
fillcolor="skyblue",
|
|
)
|
|
|
|
# Fetch DWAs
|
|
iwa_ids = iwas_df["iwa_id"].tolist()
|
|
dwas_df = get_dwas_for_iwas(conn, iwa_ids)
|
|
|
|
dwa_ids_for_tasks = []
|
|
if not dwas_df.empty:
|
|
dwa_ids_for_tasks = dwas_df["dwa_id"].unique().tolist()
|
|
|
|
# Fetch Tasks
|
|
tasks_df = get_tasks_for_dwas(
|
|
conn, selected_onetsoc_code, dwa_ids_for_tasks
|
|
)
|
|
|
|
# Add IWA Nodes and Edges
|
|
for _, iwa_row in iwas_df.iterrows():
|
|
iwa_node_id = f"iwa_{str(iwa_row['iwa_id']).replace('.', '_')}"
|
|
iwa_label = smart_wrap(
|
|
f"IWA: {iwa_row['iwa_title']}\n(ID: {iwa_row['iwa_id']})", width=35
|
|
)
|
|
graph.node(
|
|
iwa_node_id,
|
|
label=iwa_label,
|
|
shape="box",
|
|
style="filled",
|
|
fillcolor="khaki",
|
|
)
|
|
graph.edge(occ_node_id, iwa_node_id)
|
|
|
|
# Add DWA Nodes and Edges (for this IWA)
|
|
current_iwa_dwas = dwas_df[dwas_df["iwa_id"] == iwa_row["iwa_id"]]
|
|
for _, dwa_row in current_iwa_dwas.iterrows():
|
|
dwa_node_id = f"dwa_{str(dwa_row['dwa_id']).replace('.', '_')}"
|
|
dwa_label = smart_wrap(
|
|
f"DWA: {dwa_row['dwa_title']}\n(ID: {dwa_row['dwa_id']})",
|
|
width=40,
|
|
)
|
|
graph.node(
|
|
dwa_node_id,
|
|
label=dwa_label,
|
|
shape="box",
|
|
style="filled",
|
|
fillcolor="lightcoral",
|
|
)
|
|
graph.edge(iwa_node_id, dwa_node_id)
|
|
|
|
# Add Task Nodes and Edges (for this DWA and Occupation)
|
|
current_dwa_tasks = tasks_df[
|
|
tasks_df["dwa_id"] == dwa_row["dwa_id"]
|
|
]
|
|
for _, task_row in current_dwa_tasks.iterrows():
|
|
# Ensure task_id is a string and valid for DOT
|
|
task_id_str = str(task_row["task_id"]).split(".")[
|
|
0
|
|
] # Handle decimal task_ids if they appear
|
|
task_node_id = f"task_{task_id_str}"
|
|
task_label = smart_wrap(
|
|
f"Task: {task_row['task']}\n(ID: {task_id_str})", width=50
|
|
)
|
|
graph.node(
|
|
task_node_id,
|
|
label=task_label,
|
|
shape="note",
|
|
style="filled",
|
|
fillcolor="lightgray",
|
|
)
|
|
graph.edge(dwa_node_id, task_node_id)
|
|
|
|
if (
|
|
not graph.body or len(graph.body) <= 1
|
|
): # Check if any nodes were actually added beyond the occupation
|
|
st.info(
|
|
"No hierarchical data (IWAs, DWAs, Tasks) to display for this occupation after initial selection."
|
|
)
|
|
else:
|
|
try:
|
|
st.graphviz_chart(graph, use_container_width=True)
|
|
with st.expander("View Data Tables for Selected Occupation"):
|
|
st.markdown("##### Intermediate Work Activities (IWAs)")
|
|
st.dataframe(iwas_df, use_container_width=True)
|
|
if not dwas_df.empty:
|
|
st.markdown("##### Detailed Work Activities (DWAs)")
|
|
st.dataframe(dwas_df, use_container_width=True)
|
|
if not tasks_df.empty:
|
|
st.markdown("##### Task Statements")
|
|
st.dataframe(tasks_df, use_container_width=True)
|
|
|
|
except Exception as e:
|
|
st.error(
|
|
f"Could not render the graph. Graphviz might not be installed correctly or there's an issue with the graph data: {e}"
|
|
)
|
|
st.text("Graphviz DOT source (for debugging):")
|
|
st.code(graph.source, language="dot")
|
|
else:
|
|
st.info("Select an occupation from the control panel to see its graph.")
|
|
|
|
# Instructions to run the app:
|
|
# 1. Save this code as a Python file (e.g., onet_explorer_app.py).
|
|
# 2. Ensure the 'onet.database' file is in the same directory.
|
|
# 3. Install the required libraries: pip install streamlit pandas graphviz
|
|
# 4. Open your terminal or command prompt, navigate to the directory, and run:
|
|
# streamlit run onet_explorer_app.py
|