wip

2025-04-28 05:02:28 +02:00 · 2025-04-28 05:02:28 +02:00 · 720f21a85b
commit 720f21a85b
parent 19bf2e6b18
10 changed files with 11122 additions and 356 deletions
--- a/enrich_task_ratings.py
+++ b/enrich_task_ratings.py
@ -3,35 +3,38 @@ import pandas as pd
 import json
 import os
 from collections import defaultdict
-import numpy as np  # Import numpy for nan handling if necessary
+import numpy as np

 # --- Configuration ---
 DB_FILE = "onet.database"
-OUTPUT_FILE = "task_ratings_enriched.json"
+OUTPUT_FILE = "task_ratings_enriched.json"  # Path of the enriched JSON output file

 # --- Database Interaction ---


 def fetch_data_from_db(db_path):
    """
-    Fetches required data from the O*NET SQLite database using JOINs.
+    Fetches required data from the O*NET SQLite database using JOINs,
+    including DWAs.

    Args:
        db_path (str): Path to the SQLite database file.

    Returns:
-        pandas.DataFrame: DataFrame containing joined data from task_ratings,
-                          task_statements, and occupation_data.
-        Returns None if the database file doesn't exist or an error occurs.
+        tuple(pandas.DataFrame, pandas.DataFrame): A tuple containing:
+            - DataFrame with task ratings info.
+            - DataFrame with task-to-DWA mapping.
+        Returns (None, None) if the database file doesn't exist or an error occurs.
    """
    if not os.path.exists(db_path):
        print(f"Error: Database file not found at {db_path}")
-        return None
+        return None, None

    try:
        conn = sqlite3.connect(db_path)
        # Construct the SQL query to join the tables and select necessary columns
-        # We select all relevant columns needed for processing.
+        # Added LEFT JOINs for tasks_to_dwas and dwa_reference
+        # Use LEFT JOIN in case a task has no DWAs
        query = """
        SELECT
            tr.onetsoc_code,
@ -41,136 +44,277 @@ def fetch_data_from_db(db_path):
            od.description AS occupation_description,
            tr.scale_id,
            tr.category,
-            tr.data_value
+            tr.data_value,
+            dr.dwa_title  -- Added DWA title
        FROM
            task_ratings tr
        JOIN
            task_statements ts ON tr.task_id = ts.task_id
        JOIN
-            occupation_data od ON tr.onetsoc_code = od.onetsoc_code;
+            occupation_data od ON tr.onetsoc_code = od.onetsoc_code
+        LEFT JOIN
+            tasks_to_dwas td ON tr.onetsoc_code = td.onetsoc_code AND tr.task_id = td.task_id --
+        LEFT JOIN
+            dwa_reference dr ON td.dwa_id = dr.dwa_id; --
        """
        df = pd.read_sql_query(query, conn)
        conn.close()
-        print(f"Successfully fetched {len(df)} records from the database.")
-        return df
-    except sqlite3.Error as e:
-        print(f"SQLite error: {e}")
-        if conn:
-            conn.close()
-        return None
-    except Exception as e:
-        print(f"An error occurred during data fetching: {e}")
-        if "conn" in locals() and conn:
-            conn.close()
-        return None
+        print(
+            f"Successfully fetched {len(df)} records (including DWA info) from the database."
+        )

-
-# --- Data Processing ---
-
-
-def process_task_ratings(df):
-    """
-    Processes the fetched data to group, pivot frequency, calculate averages,
-    and structure the output.
-
-    Args:
-        df (pandas.DataFrame): The input DataFrame with joined data.
-
-    Returns:
-        list: A list of dictionaries, each representing an enriched task rating.
-              Returns None if the input DataFrame is invalid.
-    """
-    if df is None or df.empty:
-        print("Error: Input DataFrame is empty or invalid.")
-        return None
-
-    print("Starting data processing...")
-
-    # --- 1. Handle Frequency (FT) ---
-    # Filter for Frequency ratings
-    freq_df = df[df["scale_id"] == "FT"].copy()
-    # Pivot the frequency data: index by task and occupation, columns by category
-    # We fill missing frequency values with 0, assuming no rating means 0% for that category.
-    freq_pivot = freq_df.pivot_table(
-        index=["onetsoc_code", "task_id"],
-        columns="category",
-        values="data_value",
-        fill_value=0,  # Fill missing categories for a task/occupation with 0
-    )
-    # Rename columns for clarity using the requested format
-    freq_pivot.columns = [
-        f"frequency_category_{int(col)}" for col in freq_pivot.columns
-    ]  # <-- UPDATED LINE
-    print(f"Processed Frequency data. Shape: {freq_pivot.shape}")
-
-    # --- 2. Handle Importance (IM, IJ) ---
-    # Filter for Importance ratings
-    imp_df = df[df["scale_id"].isin(["IM", "IJ"])].copy()
-    # Group by task and occupation, calculate the mean importance
-    # Using np.nanmean to handle potential NaN values gracefully if any exist
-    imp_avg = (
-        imp_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
-    )
-    imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True)
-    print(f"Processed Importance data. Shape: {imp_avg.shape}")
-
-    # --- 3. Handle Relevance (RT) ---
-    # Filter for Relevance ratings
-    rel_df = df[df["scale_id"] == "RT"].copy()
-    # Group by task and occupation, calculate the mean relevance
-    rel_avg = (
-        rel_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
-    )
-    rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True)
-    print(f"Processed Relevance data. Shape: {rel_avg.shape}")
-
-    # --- 4. Get Base Task/Occupation Info ---
-    # Select unique combinations of task and occupation details
-    base_info = (
-        df[
-            [
+        if df.empty:
+            print("Warning: Fetched DataFrame is empty.")
+            # Return empty DataFrames with expected columns if the main fetch is empty
+            ratings_cols = [
                "onetsoc_code",
                "task_id",
                "task",
                "occupation_title",
                "occupation_description",
+                "scale_id",
+                "category",
+                "data_value",
            ]
-        ]
-        .drop_duplicates()
-        .set_index(["onetsoc_code", "task_id"])
-    )
-    print(f"Extracted base info. Shape: {base_info.shape}")
+            dwa_cols = ["onetsoc_code", "task_id", "dwa_title"]
+            return pd.DataFrame(columns=ratings_cols), pd.DataFrame(columns=dwa_cols)

-    # --- 5. Merge Processed Data ---
-    # Start with the base info and merge the calculated/pivoted data
-    # Use 'left' joins to ensure all tasks/occupations from the base_info are kept.
-    # If a task/occupation doesn't have frequency, importance, or relevance ratings,
-    # the corresponding columns will have NaN values after the merge.
+        # Remove duplicates caused by joining ratings with potentially multiple DWAs per task
+        # Keep only unique combinations of the core task/rating info before processing
+        core_cols = [
+            "onetsoc_code",
+            "task_id",
+            "task",
+            "occupation_title",
+            "occupation_description",
+            "scale_id",
+            "category",
+            "data_value",
+        ]
+        # Check if all core columns exist before attempting to drop duplicates
+        missing_core_cols = [col for col in core_cols if col not in df.columns]
+        if missing_core_cols:
+            print(f"Error: Missing core columns in fetched data: {missing_core_cols}")
+            return None, None
+        ratings_df = df[core_cols].drop_duplicates().reset_index(drop=True)
+
+        # Get unique DWA info separately
+        dwa_cols = ["onetsoc_code", "task_id", "dwa_title"]
+        # Check if all DWA columns exist before processing
+        if all(col in df.columns for col in dwa_cols):
+            dwas_df = (
+                df[dwa_cols]
+                .dropna(subset=["dwa_title"])
+                .drop_duplicates()
+                .reset_index(drop=True)
+            )
+        else:
+            print("Warning: DWA related columns missing, creating empty DWA DataFrame.")
+            dwas_df = pd.DataFrame(
+                columns=dwa_cols
+            )  # Create empty df if columns missing
+
+        return ratings_df, dwas_df  # Return two dataframes now
+
+    except sqlite3.Error as e:
+        print(f"SQLite error: {e}")
+        if "conn" in locals() and conn:
+            conn.close()
+        return None, None  # Return None for both if error
+    except Exception as e:
+        print(f"An error occurred during data fetching: {e}")
+        if "conn" in locals() and conn:
+            conn.close()
+        return None, None  # Return None for both if error
+
+
+# --- Data Processing ---
+
+
+def process_task_ratings_with_dwas(ratings_df, dwas_df):
+    """
+    Processes the fetched data to group, pivot frequency, calculate averages,
+    structure the output, and add associated DWAs.
+
+    Args:
+        ratings_df (pandas.DataFrame): The input DataFrame with task ratings info.
+        dwas_df (pandas.DataFrame): The input DataFrame with task-to-DWA mapping. Can be None or empty.
+
+    Returns:
+        list: A list of dictionaries, each representing an enriched task rating with DWAs.
+              Returns None if the input ratings DataFrame is invalid.
+    """
+    if ratings_df is None or not isinstance(
+        ratings_df, pd.DataFrame
+    ):  # Check if it's a DataFrame
+        print("Error: Input ratings DataFrame is invalid.")
+        return None
+    if ratings_df.empty:
+        print(
+            "Warning: Input ratings DataFrame is empty. Processing will yield empty result."
+        )
+        # Decide how to handle empty input, maybe return empty list directly
+        # return []
+
+    # Ensure dwas_df is a DataFrame, even if empty
+    if dwas_df is None or not isinstance(dwas_df, pd.DataFrame):
+        print("Warning: Invalid or missing DWA DataFrame. Proceeding without DWA data.")
+        dwas_df = pd.DataFrame(
+            columns=["onetsoc_code", "task_id", "dwa_title"]
+        )  # Ensure it's an empty DF
+
+    print("Starting data processing...")
+
+    # --- 1. Handle Frequency (FT) ---
+    freq_df = ratings_df[ratings_df["scale_id"] == "FT"].copy()
+    if not freq_df.empty:
+        freq_pivot = freq_df.pivot_table(
+            index=["onetsoc_code", "task_id"],
+            columns="category",
+            values="data_value",
+            fill_value=0,
+        )
+        freq_pivot.columns = [
+            f"frequency_category_{int(col)}" for col in freq_pivot.columns
+        ]
+        print(f"Processed Frequency data. Shape: {freq_pivot.shape}")
+    else:
+        print("No Frequency (FT) data found.")
+        # Create an empty DataFrame with the multi-index to allow merging later
+        idx = pd.MultiIndex(
+            levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"]
+        )
+        freq_pivot = pd.DataFrame(index=idx)
+
+    # --- 2. Handle Importance (IM, IJ) ---
+    imp_df = ratings_df[ratings_df["scale_id"].isin(["IM", "IJ"])].copy()
+    if not imp_df.empty:
+        imp_avg = (
+            imp_df.groupby(["onetsoc_code", "task_id"])["data_value"]
+            .mean()
+            .reset_index()
+        )
+        imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True)
+        print(f"Processed Importance data. Shape: {imp_avg.shape}")
+    else:
+        print("No Importance (IM, IJ) data found.")
+        imp_avg = pd.DataFrame(
+            columns=["onetsoc_code", "task_id", "importance_average"]
+        )
+
+    # --- 3. Handle Relevance (RT) ---
+    rel_df = ratings_df[ratings_df["scale_id"] == "RT"].copy()
+    if not rel_df.empty:
+        rel_avg = (
+            rel_df.groupby(["onetsoc_code", "task_id"])["data_value"]
+            .mean()
+            .reset_index()
+        )
+        rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True)
+        print(f"Processed Relevance data. Shape: {rel_avg.shape}")
+    else:
+        print("No Relevance (RT) data found.")
+        rel_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "relevance_average"])
+
+    # --- 4. Process DWAs ---
+    if dwas_df is not None and not dwas_df.empty and "dwa_title" in dwas_df.columns:
+        print("Processing DWA data...")
+        # Group DWAs by (onetsoc_code, task_id) and aggregate titles into a list
+        dwas_grouped = (
+            dwas_df.groupby(["onetsoc_code", "task_id"])["dwa_title"]
+            .apply(list)
+            .reset_index()
+        )  #
+        dwas_grouped.rename(
+            columns={"dwa_title": "dwas"}, inplace=True
+        )  # Rename column to 'dwas'
+        print(f"Processed DWA data. Shape: {dwas_grouped.shape}")
+    else:
+        print("No valid DWA data found or provided for processing.")
+        dwas_grouped = None  # Set to None if no DWAs
+
+    # --- 5. Get Base Task/Occupation Info ---
+    base_cols = [
+        "onetsoc_code",
+        "task_id",
+        "task",
+        "occupation_title",
+        "occupation_description",
+    ]
+    # Check if base columns exist in ratings_df
+    missing_base_cols = [col for col in base_cols if col not in ratings_df.columns]
+    if missing_base_cols:
+        print(
+            f"Error: Missing base info columns in ratings_df: {missing_base_cols}. Cannot proceed."
+        )
+        return None
+    if not ratings_df.empty:
+        base_info = (
+            ratings_df[base_cols]
+            .drop_duplicates()
+            .set_index(["onetsoc_code", "task_id"])
+        )
+        print(f"Extracted base info. Shape: {base_info.shape}")
+    else:
+        print("Cannot extract base info from empty ratings DataFrame.")
+        # Create an empty df with index to avoid errors later if possible
+        idx = pd.MultiIndex(
+            levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"]
+        )
+        base_info = pd.DataFrame(
+            index=idx,
+            columns=[
+                col for col in base_cols if col not in ["onetsoc_code", "task_id"]
+            ],
+        )
+
+    # --- 6. Merge Processed Data ---
    print("Merging processed data...")
+    # Start with base_info, which should have the index ['onetsoc_code', 'task_id']
    final_df = base_info.merge(
        freq_pivot, left_index=True, right_index=True, how="left"
    )
-    # Set index before merging averages which are not multi-indexed
+    # Reset index before merging non-indexed dfs
    final_df = final_df.reset_index()
-    final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left")
-    final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left")

-    # Fill potential NaN values resulting from left joins if needed.
-    # For averages, NaN might mean no rating was provided. We can leave them as NaN
-    # or fill with 0 or another placeholder depending on desired interpretation.
-    # For frequency categories, NaN could mean that category wasn't rated. We filled with 0 during pivot.
-    # Example: Fill NaN averages with 0
-    # final_df['importance_average'].fillna(0, inplace=True)
-    # final_df['relevance_average'].fillna(0, inplace=True)
-    # Note: Leaving NaNs might be more informative.
+    # Merge averages - check if they are not empty before merging
+    if not imp_avg.empty:
+        final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left")
+    else:
+        final_df["importance_average"] = np.nan  # Add column if imp_avg was empty
+
+    if not rel_avg.empty:
+        final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left")
+    else:
+        final_df["relevance_average"] = np.nan  # Add column if rel_avg was empty
+
+    # Merge DWAs if available
+    if dwas_grouped is not None and not dwas_grouped.empty:
+        final_df = final_df.merge(
+            dwas_grouped, on=["onetsoc_code", "task_id"], how="left"
+        )  # Merge the dwas list
+        # Fill NaN in 'dwas' column (for tasks with no DWAs) with empty lists
+        # Check if 'dwas' column exists before applying function
+        if "dwas" in final_df.columns:
+            final_df["dwas"] = final_df["dwas"].apply(
+                lambda x: x if isinstance(x, list) else []
+            )  # Ensure tasks without DWAs get []
+        else:
+            print("Warning: 'dwas' column not created during merge.")
+            final_df["dwas"] = [
+                [] for _ in range(len(final_df))
+            ]  # Add empty list column
+
+    else:
+        # Add an empty 'dwas' column if no DWA data was processed or merged
+        final_df["dwas"] = [[] for _ in range(len(final_df))]

    print(f"Final merged data shape: {final_df.shape}")

    # Convert DataFrame to list of dictionaries for JSON output
    # Handle potential NaN values during JSON conversion
-    final_df = final_df.replace(
-        {np.nan: None}
-    )  # Replace numpy NaN with Python None for JSON compatibility
+    # Replace numpy NaN with Python None for JSON compatibility
+    final_df = final_df.replace({np.nan: None})
    result_list = final_df.to_dict(orient="records")

    return result_list
@ -190,13 +334,30 @@ def write_to_json(data, output_path):
    if data is None:
        print("No data to write to JSON.")
        return
+    if not isinstance(data, list):
+        print(
+            f"Error: Data to write is not a list (type: {type(data)}). Cannot write to JSON."
+        )
+        return
+
+    # Create directory if it doesn't exist
+    output_dir = os.path.dirname(output_path)
+    if output_dir and not os.path.exists(output_dir):
+        try:
+            os.makedirs(output_dir)
+            print(f"Created output directory: {output_dir}")
+        except OSError as e:
+            print(f"Error creating output directory {output_dir}: {e}")
+            return  # Exit if cannot create directory

    try:
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4, ensure_ascii=False)
        print(f"Successfully wrote enriched data to {output_path}")
    except IOError as e:
-        print(f"Error writing JSON file: {e}")
+        print(f"Error writing JSON file to {output_path}: {e}")
+    except TypeError as e:
+        print(f"Error during JSON serialization: {e}. Check data types.")
    except Exception as e:
        print(f"An unexpected error occurred during JSON writing: {e}")

@ -204,20 +365,28 @@ def write_to_json(data, output_path):
 # --- Main Execution ---

 if __name__ == "__main__":
-    print("Starting O*NET Task Ratings Enrichment Script...")
+    print("Starting O*NET Task Ratings & DWAs Enrichment Script...")
    # 1. Fetch data
-    raw_data_df = fetch_data_from_db(DB_FILE)
+    ratings_data_df, dwas_data_df = fetch_data_from_db(DB_FILE)  # Fetch both datasets

    # 2. Process data
-    if raw_data_df is not None:
-        enriched_data = process_task_ratings(raw_data_df)
+    # Proceed only if ratings_data_df is a valid DataFrame (even if empty)
+    # dwas_data_df can be None or empty, handled inside process function
+    if isinstance(ratings_data_df, pd.DataFrame):
+        enriched_data = process_task_ratings_with_dwas(
+            ratings_data_df, dwas_data_df
+        )  # Pass both dataframes

        # 3. Write output
-        if enriched_data:
+        if (
+            enriched_data is not None
+        ):  # Check if processing returned data (even an empty list is valid)
            write_to_json(enriched_data, OUTPUT_FILE)
        else:
-            print("Data processing failed. No output file generated.")
+            print("Data processing failed or returned None. No output file generated.")
    else:
-        print("Data fetching failed. Script terminated.")
+        print(
+            "Data fetching failed or returned invalid type for ratings data. Script terminated."
+        )

    print("Script finished.")