progress

parent 8c0b53a32c
commit 19bf2e6b18

9 changed files with 2675 additions and 1 deletion
.env.example (new file, 1 line)
@@ -0,0 +1 @@
export OPENAI_API_KEY=
.envrc (modified)
@@ -1 +1 @@
-use flake
+use flake .#impure
.gitignore (vendored, new file, 8 lines)
@@ -0,0 +1,8 @@
epoch_task_data.csv
oesm23national.xlsx
onet.database*
onet_occupation_data.json
schema.sql
task_ratings_enriched.json
.env
.ipynb_checkpoints
Untitled.ipynb (new file, 641 lines)
@@ -0,0 +1,641 @@
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 86,
|
||||||
|
"id": "beace815-b5ae-44a4-a81c-a7f82cb66296",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\u001b[2K\u001b[2mResolved \u001b[1m118 packages\u001b[0m \u001b[2min 386ms\u001b[0m\u001b[0m \u001b[0m\n",
|
||||||
|
"\u001b[2K\u001b[2mPrepared \u001b[1m2 packages\u001b[0m \u001b[2min 124ms\u001b[0m\u001b[0m \n",
|
||||||
|
"\u001b[2K\u001b[2mInstalled \u001b[1m2 packages\u001b[0m \u001b[2min 5ms\u001b[0m\u001b[0m \u001b[0m\n",
|
||||||
|
" \u001b[32m+\u001b[39m \u001b[1met-xmlfile\u001b[0m\u001b[2m==2.0.0\u001b[0m\n",
|
||||||
|
" \u001b[32m+\u001b[39m \u001b[1mopenpyxl\u001b[0m\u001b[2m==3.1.5\u001b[0m\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"!uv add pandas requests openai dotenv openpyxl"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "941d511f-ad72-4306-bbab-52127583e513",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"import dotenv\n",
|
||||||
|
"import openai\n",
|
||||||
|
"import sqlite3\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"\n",
|
||||||
|
"dotenv.load_dotenv() # Copy .env.example to .env and fill in the blanks\n",
|
||||||
|
"\n",
|
||||||
|
"oai_token = os.getenv(\"OPENAI_API_KEY\")\n",
|
||||||
|
"\n",
|
||||||
|
"oai = openai.OpenAI(api_key=oai_token)\n",
|
||||||
|
"onet = sqlite3.connect(\"onet.database\") # Run ./create_onet_database.sh to create it\n",
|
||||||
|
"# This dataset comes from https://epoch.ai/gradient-updates/consequences-of-automating-remote-work\n",
|
||||||
|
"# It contains labels for whethere a O*NET task can be done remotely or not (labeled by GPT-4o)\n",
|
||||||
|
"# You can download it here: https://drive.google.com/file/d/1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r/view?usp=sharing\n",
|
||||||
|
"df_remote_status = pd.read_csv(\"epoch_task_data.csv\")\n",
|
||||||
|
"\n",
|
||||||
|
"# BLS OEWS: https://www.bls.gov/oes/special-requests/oesm23nat.zip\n",
|
||||||
|
"df_oesm = pd.read_excel(\"oesm23national.xlsx\")\n",
|
||||||
|
"\n",
|
||||||
|
"# Run uv run enrich_task_ratings.py to get this file (trs = Task RatingS)\n",
|
||||||
|
"df_enriched_trs = pd.read_json(\"task_ratings_enriched.json\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "a5351f8b-c2ad-4d3e-af4a-992f539a6064",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"FREQUENCY_MAP = {\n",
|
||||||
|
" 'frequency_category_1': \"Yearly or less\",\n",
|
||||||
|
" 'frequency_category_2': \"More than yearly\",\n",
|
||||||
|
" 'frequency_category_3': \"More than monthly\",\n",
|
||||||
|
" 'frequency_category_4': \"More than weekly\",\n",
|
||||||
|
" 'frequency_category_5': \"Daily\",\n",
|
||||||
|
" 'frequency_category_6': \"Several times daily\",\n",
|
||||||
|
" 'frequency_category_7': \"Hourly or more\"\n",
|
||||||
|
"}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "8b2ab22a-afab-41f9-81a3-48eab261b568",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"background_prompt = '''\n",
|
||||||
|
"Estimate the typical duration to complete *one instance* of the following job task from the moment a person starts to work on it to the last moment the person will need to keep it in mind\n",
|
||||||
|
"\n",
|
||||||
|
"Take into account that there might be delays between the steps to complete the task, which would lengthen the estimate.\n",
|
||||||
|
"\n",
|
||||||
|
"Output a range with the format [duration A] - [duration B] where [duration A] and [duration B] correspond to one of the durations below:\n",
|
||||||
|
"- less than 30 minutes\n",
|
||||||
|
"- 30 minutes\n",
|
||||||
|
"- 1 hour\n",
|
||||||
|
"- 4 hours\n",
|
||||||
|
"- 8 hours\n",
|
||||||
|
"- 16 hours\n",
|
||||||
|
"- 3 days\n",
|
||||||
|
"- 1 week\n",
|
||||||
|
"- 3 weeks\n",
|
||||||
|
"- 6 weeks\n",
|
||||||
|
"- 3 months\n",
|
||||||
|
"- 6 months\n",
|
||||||
|
"- 1 year\n",
|
||||||
|
"- 3 years\n",
|
||||||
|
"- more than 3 year\n",
|
||||||
|
"\n",
|
||||||
|
"**Do not output anything besides the range**\n",
|
||||||
|
"'''"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"id": "d2e4a855-f327-4b3d-ad0b-ed997e720639",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"df_oesm_detailed = df_oesm[df_oesm['O_GROUP'] == 'detailed'][['OCC_CODE', 'TOT_EMP', 'H_MEAN', 'A_MEAN']].copy()\n",
|
||||||
|
"df_enriched_trs['occ_code_join'] = df_enriched_trs['onetsoc_code'].str[:7]\n",
|
||||||
|
"df_merged = pd.merge(\n",
|
||||||
|
" df_enriched_trs,\n",
|
||||||
|
" df_oesm_detailed,\n",
|
||||||
|
" left_on='occ_code_join',\n",
|
||||||
|
" right_on='OCC_CODE',\n",
|
||||||
|
" how='left'\n",
|
||||||
|
")\n",
|
||||||
|
"df_merged = df_merged.drop(columns=['occ_code_join'])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"id": "9be7acb5-2374-4f61-bba3-13b0077c0bd2",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Task: Develop or recommend network security measures, such as firewalls, network security audits, or automated security probes.\n",
|
||||||
|
"Occupation Description: Design and implement computer and information networks, such as local area networks (LAN), wide area networks (WAN), intranets, extranets, and other data communications networks. Perform network modeling, analysis, and planning, including analysis of capacity needs for network infrastructures. May also design network and computer security measures. May research and recommend network and data communications hardware and software.\n",
|
||||||
|
"Occupation Title: Computer Network Architects\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"onetsoc_code 15-1241.00\n",
|
||||||
|
"task_id 18971\n",
|
||||||
|
"task Develop or recommend network security measures...\n",
|
||||||
|
"occupation_title Computer Network Architects\n",
|
||||||
|
"occupation_description Design and implement computer and information ...\n",
|
||||||
|
"Yearly or less 0.0\n",
|
||||||
|
"More than yearly 30.0\n",
|
||||||
|
"More than monthly 15.0\n",
|
||||||
|
"More than weekly 20.0\n",
|
||||||
|
"Daily 15.0\n",
|
||||||
|
"Several times daily 15.0\n",
|
||||||
|
"Hourly or more 5.0\n",
|
||||||
|
"importance_average 4.35\n",
|
||||||
|
"relevance_average 100.0\n",
|
||||||
|
"occ_code_join 15-1241\n",
|
||||||
|
"remote remote\n",
|
||||||
|
"Name: 45200, dtype: object"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"df_merged = pd \\\n",
|
||||||
|
" .merge(left=df_enriched_trs, right=df_remote_status[['O*NET-SOC Code', 'Remote']], how='left', left_on='onetsoc_code', right_on='O*NET-SOC Code') \\\n",
|
||||||
|
" .drop(columns=['O*NET-SOC Code']) \\\n",
|
||||||
|
" .rename(columns={'Remote': 'remote'}) \\\n",
|
||||||
|
" .rename(columns=FREQUENCY_MAP) \\\n",
|
||||||
|
" .query('remote == \"remote\" and importance_average >= 3')\n",
|
||||||
|
"\n",
|
||||||
|
"row = df_merged.iloc[30000]\n",
|
||||||
|
"print('Task: ', row['task'])\n",
|
||||||
|
"print('Occupation Description: ', row['occupation_description'])\n",
|
||||||
|
"print('Occupation Title: ', row['occupation_title'])\n",
|
||||||
|
"\n",
|
||||||
|
"row"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"id": "9e5ea89f-2c18-459d-851d-dacb379f4a2e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>onetsoc_code</th>\n",
|
||||||
|
" <th>task_id</th>\n",
|
||||||
|
" <th>task</th>\n",
|
||||||
|
" <th>occupation_title</th>\n",
|
||||||
|
" <th>occupation_description</th>\n",
|
||||||
|
" <th>Yearly or less</th>\n",
|
||||||
|
" <th>More than yearly</th>\n",
|
||||||
|
" <th>More than monthly</th>\n",
|
||||||
|
" <th>More than weekly</th>\n",
|
||||||
|
" <th>Daily</th>\n",
|
||||||
|
" <th>Several times daily</th>\n",
|
||||||
|
" <th>Hourly or more</th>\n",
|
||||||
|
" <th>importance_average</th>\n",
|
||||||
|
" <th>relevance_average</th>\n",
|
||||||
|
" <th>remote</th>\n",
|
||||||
|
" <th>OCC_CODE</th>\n",
|
||||||
|
" <th>TOT_EMP</th>\n",
|
||||||
|
" <th>H_MEAN</th>\n",
|
||||||
|
" <th>A_MEAN</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>11-1011.00</td>\n",
|
||||||
|
" <td>8823</td>\n",
|
||||||
|
" <td>Direct or coordinate an organization's financi...</td>\n",
|
||||||
|
" <td>Chief Executives</td>\n",
|
||||||
|
" <td>Determine and formulate policies and provide o...</td>\n",
|
||||||
|
" <td>5.92</td>\n",
|
||||||
|
" <td>15.98</td>\n",
|
||||||
|
" <td>29.68</td>\n",
|
||||||
|
" <td>21.18</td>\n",
|
||||||
|
" <td>19.71</td>\n",
|
||||||
|
" <td>4.91</td>\n",
|
||||||
|
" <td>2.63</td>\n",
|
||||||
|
" <td>4.52</td>\n",
|
||||||
|
" <td>74.44</td>\n",
|
||||||
|
" <td>remote</td>\n",
|
||||||
|
" <td>11-1011</td>\n",
|
||||||
|
" <td>211230.0</td>\n",
|
||||||
|
" <td>124.47</td>\n",
|
||||||
|
" <td>258900</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>11-1011.00</td>\n",
|
||||||
|
" <td>8823</td>\n",
|
||||||
|
" <td>Direct or coordinate an organization's financi...</td>\n",
|
||||||
|
" <td>Chief Executives</td>\n",
|
||||||
|
" <td>Determine and formulate policies and provide o...</td>\n",
|
||||||
|
" <td>5.92</td>\n",
|
||||||
|
" <td>15.98</td>\n",
|
||||||
|
" <td>29.68</td>\n",
|
||||||
|
" <td>21.18</td>\n",
|
||||||
|
" <td>19.71</td>\n",
|
||||||
|
" <td>4.91</td>\n",
|
||||||
|
" <td>2.63</td>\n",
|
||||||
|
" <td>4.52</td>\n",
|
||||||
|
" <td>74.44</td>\n",
|
||||||
|
" <td>remote</td>\n",
|
||||||
|
" <td>11-1011</td>\n",
|
||||||
|
" <td>211230.0</td>\n",
|
||||||
|
" <td>124.47</td>\n",
|
||||||
|
" <td>258900</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>11-1011.00</td>\n",
|
||||||
|
" <td>8823</td>\n",
|
||||||
|
" <td>Direct or coordinate an organization's financi...</td>\n",
|
||||||
|
" <td>Chief Executives</td>\n",
|
||||||
|
" <td>Determine and formulate policies and provide o...</td>\n",
|
||||||
|
" <td>5.92</td>\n",
|
||||||
|
" <td>15.98</td>\n",
|
||||||
|
" <td>29.68</td>\n",
|
||||||
|
" <td>21.18</td>\n",
|
||||||
|
" <td>19.71</td>\n",
|
||||||
|
" <td>4.91</td>\n",
|
||||||
|
" <td>2.63</td>\n",
|
||||||
|
" <td>4.52</td>\n",
|
||||||
|
" <td>74.44</td>\n",
|
||||||
|
" <td>remote</td>\n",
|
||||||
|
" <td>11-1011</td>\n",
|
||||||
|
" <td>211230.0</td>\n",
|
||||||
|
" <td>124.47</td>\n",
|
||||||
|
" <td>258900</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>11-1011.00</td>\n",
|
||||||
|
" <td>8823</td>\n",
|
||||||
|
" <td>Direct or coordinate an organization's financi...</td>\n",
|
||||||
|
" <td>Chief Executives</td>\n",
|
||||||
|
" <td>Determine and formulate policies and provide o...</td>\n",
|
||||||
|
" <td>5.92</td>\n",
|
||||||
|
" <td>15.98</td>\n",
|
||||||
|
" <td>29.68</td>\n",
|
||||||
|
" <td>21.18</td>\n",
|
||||||
|
" <td>19.71</td>\n",
|
||||||
|
" <td>4.91</td>\n",
|
||||||
|
" <td>2.63</td>\n",
|
||||||
|
" <td>4.52</td>\n",
|
||||||
|
" <td>74.44</td>\n",
|
||||||
|
" <td>remote</td>\n",
|
||||||
|
" <td>11-1011</td>\n",
|
||||||
|
" <td>211230.0</td>\n",
|
||||||
|
" <td>124.47</td>\n",
|
||||||
|
" <td>258900</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>11-1011.00</td>\n",
|
||||||
|
" <td>8823</td>\n",
|
||||||
|
" <td>Direct or coordinate an organization's financi...</td>\n",
|
||||||
|
" <td>Chief Executives</td>\n",
|
||||||
|
" <td>Determine and formulate policies and provide o...</td>\n",
|
||||||
|
" <td>5.92</td>\n",
|
||||||
|
" <td>15.98</td>\n",
|
||||||
|
" <td>29.68</td>\n",
|
||||||
|
" <td>21.18</td>\n",
|
||||||
|
" <td>19.71</td>\n",
|
||||||
|
" <td>4.91</td>\n",
|
||||||
|
" <td>2.63</td>\n",
|
||||||
|
" <td>4.52</td>\n",
|
||||||
|
" <td>74.44</td>\n",
|
||||||
|
" <td>remote</td>\n",
|
||||||
|
" <td>11-1011</td>\n",
|
||||||
|
" <td>211230.0</td>\n",
|
||||||
|
" <td>124.47</td>\n",
|
||||||
|
" <td>258900</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>...</th>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>127653</th>\n",
|
||||||
|
" <td>53-7121.00</td>\n",
|
||||||
|
" <td>12807</td>\n",
|
||||||
|
" <td>Unload cars containing liquids by connecting h...</td>\n",
|
||||||
|
" <td>Tank Car, Truck, and Ship Loaders</td>\n",
|
||||||
|
" <td>Load and unload chemicals and bulk solids, suc...</td>\n",
|
||||||
|
" <td>6.05</td>\n",
|
||||||
|
" <td>29.21</td>\n",
|
||||||
|
" <td>6.88</td>\n",
|
||||||
|
" <td>13.95</td>\n",
|
||||||
|
" <td>27.65</td>\n",
|
||||||
|
" <td>7.93</td>\n",
|
||||||
|
" <td>8.34</td>\n",
|
||||||
|
" <td>4.08</td>\n",
|
||||||
|
" <td>64.04</td>\n",
|
||||||
|
" <td>remote</td>\n",
|
||||||
|
" <td>53-7121</td>\n",
|
||||||
|
" <td>11400.0</td>\n",
|
||||||
|
" <td>29.1</td>\n",
|
||||||
|
" <td>60530</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>127654</th>\n",
|
||||||
|
" <td>53-7121.00</td>\n",
|
||||||
|
" <td>12804</td>\n",
|
||||||
|
" <td>Clean interiors of tank cars or tank trucks, u...</td>\n",
|
||||||
|
" <td>Tank Car, Truck, and Ship Loaders</td>\n",
|
||||||
|
" <td>Load and unload chemicals and bulk solids, suc...</td>\n",
|
||||||
|
" <td>1.47</td>\n",
|
||||||
|
" <td>6.33</td>\n",
|
||||||
|
" <td>21.70</td>\n",
|
||||||
|
" <td>25.69</td>\n",
|
||||||
|
" <td>32.35</td>\n",
|
||||||
|
" <td>12.47</td>\n",
|
||||||
|
" <td>0.00</td>\n",
|
||||||
|
" <td>4.02</td>\n",
|
||||||
|
" <td>44.33</td>\n",
|
||||||
|
" <td>remote</td>\n",
|
||||||
|
" <td>53-7121</td>\n",
|
||||||
|
" <td>11400.0</td>\n",
|
||||||
|
" <td>29.1</td>\n",
|
||||||
|
" <td>60530</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>127655</th>\n",
|
||||||
|
" <td>53-7121.00</td>\n",
|
||||||
|
" <td>12803</td>\n",
|
||||||
|
" <td>Lower gauge rods into tanks or read meters to ...</td>\n",
|
||||||
|
" <td>Tank Car, Truck, and Ship Loaders</td>\n",
|
||||||
|
" <td>Load and unload chemicals and bulk solids, suc...</td>\n",
|
||||||
|
" <td>4.52</td>\n",
|
||||||
|
" <td>1.76</td>\n",
|
||||||
|
" <td>4.65</td>\n",
|
||||||
|
" <td>17.81</td>\n",
|
||||||
|
" <td>37.42</td>\n",
|
||||||
|
" <td>23.31</td>\n",
|
||||||
|
" <td>10.55</td>\n",
|
||||||
|
" <td>3.88</td>\n",
|
||||||
|
" <td>65.00</td>\n",
|
||||||
|
" <td>remote</td>\n",
|
||||||
|
" <td>53-7121</td>\n",
|
||||||
|
" <td>11400.0</td>\n",
|
||||||
|
" <td>29.1</td>\n",
|
||||||
|
" <td>60530</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>127656</th>\n",
|
||||||
|
" <td>53-7121.00</td>\n",
|
||||||
|
" <td>12805</td>\n",
|
||||||
|
" <td>Operate conveyors and equipment to transfer gr...</td>\n",
|
||||||
|
" <td>Tank Car, Truck, and Ship Loaders</td>\n",
|
||||||
|
" <td>Load and unload chemicals and bulk solids, suc...</td>\n",
|
||||||
|
" <td>6.97</td>\n",
|
||||||
|
" <td>12.00</td>\n",
|
||||||
|
" <td>2.52</td>\n",
|
||||||
|
" <td>5.90</td>\n",
|
||||||
|
" <td>35.48</td>\n",
|
||||||
|
" <td>22.08</td>\n",
|
||||||
|
" <td>15.05</td>\n",
|
||||||
|
" <td>3.87</td>\n",
|
||||||
|
" <td>47.90</td>\n",
|
||||||
|
" <td>remote</td>\n",
|
||||||
|
" <td>53-7121</td>\n",
|
||||||
|
" <td>11400.0</td>\n",
|
||||||
|
" <td>29.1</td>\n",
|
||||||
|
" <td>60530</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>127657</th>\n",
|
||||||
|
" <td>53-7121.00</td>\n",
|
||||||
|
" <td>12810</td>\n",
|
||||||
|
" <td>Perform general warehouse activities, such as ...</td>\n",
|
||||||
|
" <td>Tank Car, Truck, and Ship Loaders</td>\n",
|
||||||
|
" <td>Load and unload chemicals and bulk solids, suc...</td>\n",
|
||||||
|
" <td>5.91</td>\n",
|
||||||
|
" <td>10.85</td>\n",
|
||||||
|
" <td>6.46</td>\n",
|
||||||
|
" <td>14.46</td>\n",
|
||||||
|
" <td>34.14</td>\n",
|
||||||
|
" <td>16.39</td>\n",
|
||||||
|
" <td>11.78</td>\n",
|
||||||
|
" <td>3.53</td>\n",
|
||||||
|
" <td>47.84</td>\n",
|
||||||
|
" <td>remote</td>\n",
|
||||||
|
" <td>53-7121</td>\n",
|
||||||
|
" <td>11400.0</td>\n",
|
||||||
|
" <td>29.1</td>\n",
|
||||||
|
" <td>60530</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"<p>127658 rows × 19 columns</p>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" onetsoc_code task_id \\\n",
|
||||||
|
"0 11-1011.00 8823 \n",
|
||||||
|
"1 11-1011.00 8823 \n",
|
||||||
|
"2 11-1011.00 8823 \n",
|
||||||
|
"3 11-1011.00 8823 \n",
|
||||||
|
"4 11-1011.00 8823 \n",
|
||||||
|
"... ... ... \n",
|
||||||
|
"127653 53-7121.00 12807 \n",
|
||||||
|
"127654 53-7121.00 12804 \n",
|
||||||
|
"127655 53-7121.00 12803 \n",
|
||||||
|
"127656 53-7121.00 12805 \n",
|
||||||
|
"127657 53-7121.00 12810 \n",
|
||||||
|
"\n",
|
||||||
|
" task \\\n",
|
||||||
|
"0 Direct or coordinate an organization's financi... \n",
|
||||||
|
"1 Direct or coordinate an organization's financi... \n",
|
||||||
|
"2 Direct or coordinate an organization's financi... \n",
|
||||||
|
"3 Direct or coordinate an organization's financi... \n",
|
||||||
|
"4 Direct or coordinate an organization's financi... \n",
|
||||||
|
"... ... \n",
|
||||||
|
"127653 Unload cars containing liquids by connecting h... \n",
|
||||||
|
"127654 Clean interiors of tank cars or tank trucks, u... \n",
|
||||||
|
"127655 Lower gauge rods into tanks or read meters to ... \n",
|
||||||
|
"127656 Operate conveyors and equipment to transfer gr... \n",
|
||||||
|
"127657 Perform general warehouse activities, such as ... \n",
|
||||||
|
"\n",
|
||||||
|
" occupation_title \\\n",
|
||||||
|
"0 Chief Executives \n",
|
||||||
|
"1 Chief Executives \n",
|
||||||
|
"2 Chief Executives \n",
|
||||||
|
"3 Chief Executives \n",
|
||||||
|
"4 Chief Executives \n",
|
||||||
|
"... ... \n",
|
||||||
|
"127653 Tank Car, Truck, and Ship Loaders \n",
|
||||||
|
"127654 Tank Car, Truck, and Ship Loaders \n",
|
||||||
|
"127655 Tank Car, Truck, and Ship Loaders \n",
|
||||||
|
"127656 Tank Car, Truck, and Ship Loaders \n",
|
||||||
|
"127657 Tank Car, Truck, and Ship Loaders \n",
|
||||||
|
"\n",
|
||||||
|
" occupation_description Yearly or less \\\n",
|
||||||
|
"0 Determine and formulate policies and provide o... 5.92 \n",
|
||||||
|
"1 Determine and formulate policies and provide o... 5.92 \n",
|
||||||
|
"2 Determine and formulate policies and provide o... 5.92 \n",
|
||||||
|
"3 Determine and formulate policies and provide o... 5.92 \n",
|
||||||
|
"4 Determine and formulate policies and provide o... 5.92 \n",
|
||||||
|
"... ... ... \n",
|
||||||
|
"127653 Load and unload chemicals and bulk solids, suc... 6.05 \n",
|
||||||
|
"127654 Load and unload chemicals and bulk solids, suc... 1.47 \n",
|
||||||
|
"127655 Load and unload chemicals and bulk solids, suc... 4.52 \n",
|
||||||
|
"127656 Load and unload chemicals and bulk solids, suc... 6.97 \n",
|
||||||
|
"127657 Load and unload chemicals and bulk solids, suc... 5.91 \n",
|
||||||
|
"\n",
|
||||||
|
" More than yearly More than monthly More than weekly Daily \\\n",
|
||||||
|
"0 15.98 29.68 21.18 19.71 \n",
|
||||||
|
"1 15.98 29.68 21.18 19.71 \n",
|
||||||
|
"2 15.98 29.68 21.18 19.71 \n",
|
||||||
|
"3 15.98 29.68 21.18 19.71 \n",
|
||||||
|
"4 15.98 29.68 21.18 19.71 \n",
|
||||||
|
"... ... ... ... ... \n",
|
||||||
|
"127653 29.21 6.88 13.95 27.65 \n",
|
||||||
|
"127654 6.33 21.70 25.69 32.35 \n",
|
||||||
|
"127655 1.76 4.65 17.81 37.42 \n",
|
||||||
|
"127656 12.00 2.52 5.90 35.48 \n",
|
||||||
|
"127657 10.85 6.46 14.46 34.14 \n",
|
||||||
|
"\n",
|
||||||
|
" Several times daily Hourly or more importance_average \\\n",
|
||||||
|
"0 4.91 2.63 4.52 \n",
|
||||||
|
"1 4.91 2.63 4.52 \n",
|
||||||
|
"2 4.91 2.63 4.52 \n",
|
||||||
|
"3 4.91 2.63 4.52 \n",
|
||||||
|
"4 4.91 2.63 4.52 \n",
|
||||||
|
"... ... ... ... \n",
|
||||||
|
"127653 7.93 8.34 4.08 \n",
|
||||||
|
"127654 12.47 0.00 4.02 \n",
|
||||||
|
"127655 23.31 10.55 3.88 \n",
|
||||||
|
"127656 22.08 15.05 3.87 \n",
|
||||||
|
"127657 16.39 11.78 3.53 \n",
|
||||||
|
"\n",
|
||||||
|
" relevance_average remote OCC_CODE TOT_EMP H_MEAN A_MEAN \n",
|
||||||
|
"0 74.44 remote 11-1011 211230.0 124.47 258900 \n",
|
||||||
|
"1 74.44 remote 11-1011 211230.0 124.47 258900 \n",
|
||||||
|
"2 74.44 remote 11-1011 211230.0 124.47 258900 \n",
|
||||||
|
"3 74.44 remote 11-1011 211230.0 124.47 258900 \n",
|
||||||
|
"4 74.44 remote 11-1011 211230.0 124.47 258900 \n",
|
||||||
|
"... ... ... ... ... ... ... \n",
|
||||||
|
"127653 64.04 remote 53-7121 11400.0 29.1 60530 \n",
|
||||||
|
"127654 44.33 remote 53-7121 11400.0 29.1 60530 \n",
|
||||||
|
"127655 65.00 remote 53-7121 11400.0 29.1 60530 \n",
|
||||||
|
"127656 47.90 remote 53-7121 11400.0 29.1 60530 \n",
|
||||||
|
"127657 47.84 remote 53-7121 11400.0 29.1 60530 \n",
|
||||||
|
"\n",
|
||||||
|
"[127658 rows x 19 columns]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Cross-reference woth BLS OEWS\n",
|
||||||
|
"# It doesn't really make sens to have it per-task, we only need it per-occupation...\n",
|
||||||
|
"df_oesm_detailed = df_oesm[df_oesm['O_GROUP'] == 'detailed'][['OCC_CODE', 'TOT_EMP', 'H_MEAN', 'A_MEAN']].copy()\n",
|
||||||
|
"df_merged['occ_code_join'] = df_merged['onetsoc_code'].str[:7]\n",
|
||||||
|
"df_merged = pd.merge(\n",
|
||||||
|
" df_merged,\n",
|
||||||
|
" df_oesm_detailed,\n",
|
||||||
|
" left_on='occ_code_join',\n",
|
||||||
|
" right_on='OCC_CODE',\n",
|
||||||
|
" how='left'\n",
|
||||||
|
")\n",
|
||||||
|
"df_merged = df_merged.drop(columns=['occ_code_join'])\n",
|
||||||
|
"df_merged"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 76,
|
||||||
|
"id": "08f45d91-039d-4ec0-94a2-f305a3312e6a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Why did the scarecrow win an award?\n",
|
||||||
|
"\n",
|
||||||
|
"Because he was outstanding in his field!\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"response = oai.chat.completions.create(messages=[{\"role\": \"user\", \"content\": \"Tell me a joke\"}], model=\"gpt-4.1-2025-04-14\", max_tokens=100, temperature=0.7, n=1, stop=None)\n",
|
||||||
|
"joke = response.choices[0].message.content.strip()\n",
|
||||||
|
"print(joke)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.13.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
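The notebook currently ends with a placeholder completion call (the joke cell) rather than the duration-estimation step that background_prompt sets up. Below is a minimal sketch of that step, assuming the oai client, background_prompt, and df_merged columns defined in the cells above; the estimate_duration helper and the small sample are illustrative only and are not part of this commit.

# Sketch only: apply background_prompt to one merged task row at a time.
def estimate_duration(row):
    task_context = (
        f"Task: {row['task']}\n"
        f"Occupation: {row['occupation_title']}\n"
        f"Occupation description: {row['occupation_description']}"
    )
    response = oai.chat.completions.create(
        model="gpt-4.1-2025-04-14",  # same model as the test cell above
        messages=[
            {"role": "system", "content": background_prompt},
            {"role": "user", "content": task_context},
        ],
        temperature=0,
    )
    return response.choices[0].message.content.strip()

sample = df_merged.head(5).copy()
sample["estimated_duration"] = sample.apply(estimate_duration, axis=1)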
create_onet_database.sh (new executable file, 85 lines)
@@ -0,0 +1,85 @@
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
# Set database name and directories
|
||||||
|
ONET_DB_NAME="onet.database"
|
||||||
|
ONET_ZIP_URL="https://www.onetcenter.org/dl_files/database/db_29_1_mysql.zip"
|
||||||
|
ONET_ZIP_FILE="db_29_1_mysql.zip"
|
||||||
|
ONET_EXTRACT_DIR="db_29_1_mysql"
|
||||||
|
|
||||||
|
# Download O*NET database only if not already downloaded
|
||||||
|
if [ ! -f "$ONET_ZIP_FILE" ]; then
|
||||||
|
echo "Downloading O*NET database from $ONET_ZIP_URL"
|
||||||
|
curl -L -o "$ONET_ZIP_FILE" "$ONET_ZIP_URL" || wget -O "$ONET_ZIP_FILE" "$ONET_ZIP_URL"
|
||||||
|
|
||||||
|
if [ $? -ne 0 ]; then
|
||||||
|
echo "Failed to download O*NET database"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo "Using existing O*NET database zip file"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Extract downloaded zip file only if extraction directory doesn't exist
|
||||||
|
if [ ! -d "$ONET_EXTRACT_DIR" ]; then
|
||||||
|
echo "Extracting O*NET database files"
|
||||||
|
unzip -o "$ONET_ZIP_FILE"
|
||||||
|
|
||||||
|
if [ $? -ne 0 ]; then
|
||||||
|
echo "Failed to extract O*NET database files"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo "Using existing extracted O*NET database files"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Remove existing database if it exists
|
||||||
|
if [ -f "$ONET_DB_NAME" ]; then
|
||||||
|
echo "Removing existing database"
|
||||||
|
rm "$ONET_DB_NAME"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Create a new SQLite database with optimized settings for fast import
|
||||||
|
echo "Creating new SQLite database: $ONET_DB_NAME with performance settings"
|
||||||
|
sqlite3 "$ONET_DB_NAME" << EOF
|
||||||
|
PRAGMA journal_mode = OFF;
|
||||||
|
PRAGMA synchronous = 0;
|
||||||
|
PRAGMA cache_size = 1000000;
|
||||||
|
PRAGMA locking_mode = EXCLUSIVE;
|
||||||
|
PRAGMA temp_store = MEMORY;
|
||||||
|
PRAGMA foreign_keys = ON;
|
||||||
|
EOF
|
||||||
|
|
||||||
|
# Combine and execute all SQL files in one transaction
|
||||||
|
echo "Executing SQL files in alphabetical order (single transaction mode)"
|
||||||
|
sqlite3 "$ONET_DB_NAME" << EOF
|
||||||
|
BEGIN TRANSACTION;
|
||||||
|
$(find "$ONET_EXTRACT_DIR" -name "*.sql" | sort | xargs cat)
|
||||||
|
COMMIT;
|
||||||
|
EOF
|
||||||
|
|
||||||
|
# Check if the execution was successful
|
||||||
|
if [ $? -ne 0 ]; then
|
||||||
|
echo "Error executing SQL files in batch transaction"
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
echo "Database populated successfully. Restoring reliability settings..."
|
||||||
|
|
||||||
|
# Restore reliability-focused settings after import
|
||||||
|
sqlite3 "$ONET_DB_NAME" << EOF
|
||||||
|
PRAGMA journal_mode = WAL;
|
||||||
|
PRAGMA synchronous = NORMAL;
|
||||||
|
PRAGMA locking_mode = NORMAL;
|
||||||
|
PRAGMA temp_store = DEFAULT;
|
||||||
|
PRAGMA foreign_keys = ON;
|
||||||
|
PRAGMA optimize;
|
||||||
|
VACUUM;
|
||||||
|
EOF
|
||||||
|
|
||||||
|
if [ $? -ne 0 ]; then
|
||||||
|
echo "Warning: Failed to restore reliability settings, but database is populated"
|
||||||
|
else
|
||||||
|
echo "Reliability settings restored successfully"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "O*NET database created and optimized successfully!"
|
||||||
|
fi
|
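Usage note: the notebook assumes this script has already been run ("Run ./create_onet_database.sh to create it"). A small, assumed sanity check after running it — task_statements is one of the tables the enrichment script below queries:

# Assumed post-import check, not part of the commit.
import sqlite3

conn = sqlite3.connect("onet.database")
count = conn.execute("SELECT COUNT(*) FROM task_statements").fetchone()[0]
print(f"task_statements rows: {count}")
conn.close()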
enrich_task_ratings.py (new file, 223 lines)
@@ -0,0 +1,223 @@
import sqlite3
|
||||||
|
import pandas as pd
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from collections import defaultdict
|
||||||
|
import numpy as np # Import numpy for nan handling if necessary
|
||||||
|
|
||||||
|
# --- Configuration ---
|
||||||
|
DB_FILE = "onet.database"
|
||||||
|
OUTPUT_FILE = "task_ratings_enriched.json"
|
||||||
|
|
||||||
|
# --- Database Interaction ---
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_data_from_db(db_path):
|
||||||
|
"""
|
||||||
|
Fetches required data from the O*NET SQLite database using JOINs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
db_path (str): Path to the SQLite database file.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
pandas.DataFrame: DataFrame containing joined data from task_ratings,
|
||||||
|
task_statements, and occupation_data.
|
||||||
|
Returns None if the database file doesn't exist or an error occurs.
|
||||||
|
"""
|
||||||
|
if not os.path.exists(db_path):
|
||||||
|
print(f"Error: Database file not found at {db_path}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
conn = sqlite3.connect(db_path)
|
||||||
|
# Construct the SQL query to join the tables and select necessary columns
|
||||||
|
# We select all relevant columns needed for processing.
|
||||||
|
query = """
|
||||||
|
SELECT
|
||||||
|
tr.onetsoc_code,
|
||||||
|
tr.task_id,
|
||||||
|
ts.task,
|
||||||
|
od.title AS occupation_title,
|
||||||
|
od.description AS occupation_description,
|
||||||
|
tr.scale_id,
|
||||||
|
tr.category,
|
||||||
|
tr.data_value
|
||||||
|
FROM
|
||||||
|
task_ratings tr
|
||||||
|
JOIN
|
||||||
|
task_statements ts ON tr.task_id = ts.task_id
|
||||||
|
JOIN
|
||||||
|
occupation_data od ON tr.onetsoc_code = od.onetsoc_code;
|
||||||
|
"""
|
||||||
|
df = pd.read_sql_query(query, conn)
|
||||||
|
conn.close()
|
||||||
|
print(f"Successfully fetched {len(df)} records from the database.")
|
||||||
|
return df
|
||||||
|
except sqlite3.Error as e:
|
||||||
|
print(f"SQLite error: {e}")
|
||||||
|
if "conn" in locals() and conn:
|
||||||
|
conn.close()
|
||||||
|
return None
|
||||||
|
except Exception as e:
|
||||||
|
print(f"An error occurred during data fetching: {e}")
|
||||||
|
if "conn" in locals() and conn:
|
||||||
|
conn.close()
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# --- Data Processing ---
|
||||||
|
|
||||||
|
|
||||||
|
def process_task_ratings(df):
|
||||||
|
"""
|
||||||
|
Processes the fetched data to group, pivot frequency, calculate averages,
|
||||||
|
and structure the output.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
df (pandas.DataFrame): The input DataFrame with joined data.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
list: A list of dictionaries, each representing an enriched task rating.
|
||||||
|
Returns None if the input DataFrame is invalid.
|
||||||
|
"""
|
||||||
|
if df is None or df.empty:
|
||||||
|
print("Error: Input DataFrame is empty or invalid.")
|
||||||
|
return None
|
||||||
|
|
||||||
|
print("Starting data processing...")
|
||||||
|
|
||||||
|
# --- 1. Handle Frequency (FT) ---
|
||||||
|
# Filter for Frequency ratings
|
||||||
|
freq_df = df[df["scale_id"] == "FT"].copy()
|
||||||
|
# Pivot the frequency data: index by task and occupation, columns by category
|
||||||
|
# We fill missing frequency values with 0, assuming no rating means 0% for that category.
|
||||||
|
freq_pivot = freq_df.pivot_table(
|
||||||
|
index=["onetsoc_code", "task_id"],
|
||||||
|
columns="category",
|
||||||
|
values="data_value",
|
||||||
|
fill_value=0, # Fill missing categories for a task/occupation with 0
|
||||||
|
)
|
||||||
|
# Rename columns for clarity using the requested format
|
||||||
|
freq_pivot.columns = [
|
||||||
|
f"frequency_category_{int(col)}" for col in freq_pivot.columns
|
||||||
|
] # <-- UPDATED LINE
|
||||||
|
print(f"Processed Frequency data. Shape: {freq_pivot.shape}")
|
||||||
|
|
||||||
|
# --- 2. Handle Importance (IM, IJ) ---
|
||||||
|
# Filter for Importance ratings
|
||||||
|
imp_df = df[df["scale_id"].isin(["IM", "IJ"])].copy()
|
||||||
|
# Group by task and occupation, calculate the mean importance
|
||||||
|
# Using np.nanmean to handle potential NaN values gracefully if any exist
|
||||||
|
imp_avg = (
|
||||||
|
imp_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
|
||||||
|
)
|
||||||
|
imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True)
|
||||||
|
print(f"Processed Importance data. Shape: {imp_avg.shape}")
|
||||||
|
|
||||||
|
# --- 3. Handle Relevance (RT) ---
|
||||||
|
# Filter for Relevance ratings
|
||||||
|
rel_df = df[df["scale_id"] == "RT"].copy()
|
||||||
|
# Group by task and occupation, calculate the mean relevance
|
||||||
|
rel_avg = (
|
||||||
|
rel_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
|
||||||
|
)
|
||||||
|
rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True)
|
||||||
|
print(f"Processed Relevance data. Shape: {rel_avg.shape}")
|
||||||
|
|
||||||
|
# --- 4. Get Base Task/Occupation Info ---
|
||||||
|
# Select unique combinations of task and occupation details
|
||||||
|
base_info = (
|
||||||
|
df[
|
||||||
|
[
|
||||||
|
"onetsoc_code",
|
||||||
|
"task_id",
|
||||||
|
"task",
|
||||||
|
"occupation_title",
|
||||||
|
"occupation_description",
|
||||||
|
]
|
||||||
|
]
|
||||||
|
.drop_duplicates()
|
||||||
|
.set_index(["onetsoc_code", "task_id"])
|
||||||
|
)
|
||||||
|
print(f"Extracted base info. Shape: {base_info.shape}")
|
||||||
|
|
||||||
|
# --- 5. Merge Processed Data ---
|
||||||
|
# Start with the base info and merge the calculated/pivoted data
|
||||||
|
# Use 'left' joins to ensure all tasks/occupations from the base_info are kept.
|
||||||
|
# If a task/occupation doesn't have frequency, importance, or relevance ratings,
|
||||||
|
# the corresponding columns will have NaN values after the merge.
|
||||||
|
print("Merging processed data...")
|
||||||
|
final_df = base_info.merge(
|
||||||
|
freq_pivot, left_index=True, right_index=True, how="left"
|
||||||
|
)
|
||||||
|
# Set index before merging averages which are not multi-indexed
|
||||||
|
final_df = final_df.reset_index()
|
||||||
|
final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left")
|
||||||
|
final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left")
|
||||||
|
|
||||||
|
# Fill potential NaN values resulting from left joins if needed.
|
||||||
|
# For averages, NaN might mean no rating was provided. We can leave them as NaN
|
||||||
|
# or fill with 0 or another placeholder depending on desired interpretation.
|
||||||
|
# For frequency categories, NaN could mean that category wasn't rated. We filled with 0 during pivot.
|
||||||
|
# Example: Fill NaN averages with 0
|
||||||
|
# final_df['importance_average'].fillna(0, inplace=True)
|
||||||
|
# final_df['relevance_average'].fillna(0, inplace=True)
|
||||||
|
# Note: Leaving NaNs might be more informative.
|
||||||
|
|
||||||
|
print(f"Final merged data shape: {final_df.shape}")
|
||||||
|
|
||||||
|
# Convert DataFrame to list of dictionaries for JSON output
|
||||||
|
# Handle potential NaN values during JSON conversion
|
||||||
|
final_df = final_df.replace(
|
||||||
|
{np.nan: None}
|
||||||
|
) # Replace numpy NaN with Python None for JSON compatibility
|
||||||
|
result_list = final_df.to_dict(orient="records")
|
||||||
|
|
||||||
|
return result_list
|
||||||
|
|
||||||
|
|
||||||
|
# --- Output ---
|
||||||
|
|
||||||
|
|
||||||
|
def write_to_json(data, output_path):
|
||||||
|
"""
|
||||||
|
Writes the processed data to a JSON file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
data (list): The list of dictionaries to write.
|
||||||
|
output_path (str): Path to the output JSON file.
|
||||||
|
"""
|
||||||
|
if data is None:
|
||||||
|
print("No data to write to JSON.")
|
||||||
|
return
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open(output_path, "w", encoding="utf-8") as f:
|
||||||
|
json.dump(data, f, indent=4, ensure_ascii=False)
|
||||||
|
print(f"Successfully wrote enriched data to {output_path}")
|
||||||
|
except IOError as e:
|
||||||
|
print(f"Error writing JSON file: {e}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"An unexpected error occurred during JSON writing: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
# --- Main Execution ---
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
print("Starting O*NET Task Ratings Enrichment Script...")
|
||||||
|
# 1. Fetch data
|
||||||
|
raw_data_df = fetch_data_from_db(DB_FILE)
|
||||||
|
|
||||||
|
# 2. Process data
|
||||||
|
if raw_data_df is not None:
|
||||||
|
enriched_data = process_task_ratings(raw_data_df)
|
||||||
|
|
||||||
|
# 3. Write output
|
||||||
|
if enriched_data:
|
||||||
|
write_to_json(enriched_data, OUTPUT_FILE)
|
||||||
|
else:
|
||||||
|
print("Data processing failed. No output file generated.")
|
||||||
|
else:
|
||||||
|
print("Data fetching failed. Script terminated.")
|
||||||
|
|
||||||
|
print("Script finished.")
|
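The notebook reads this script's output with pd.read_json("task_ratings_enriched.json"). A brief, assumed usage sketch: run the script with "uv run enrich_task_ratings.py", then inspect the result as below; the column list in the comment reflects what the script writes.

# Assumed inspection of the enriched output, not part of the commit.
import pandas as pd

df = pd.read_json("task_ratings_enriched.json")
print(df.shape)
print(df.columns.tolist())  # onetsoc_code, task_id, task, occupation_title, occupation_description,
                            # frequency_category_1..7, importance_average, relevance_average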
flake.lock (generated, new file, 99 lines)
@@ -0,0 +1,99 @@
{
|
||||||
|
"nodes": {
|
||||||
|
"nixpkgs": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1745526057,
|
||||||
|
"narHash": "sha256-ITSpPDwvLBZBnPRS2bUcHY3gZSwis/uTe255QgMtTLA=",
|
||||||
|
"owner": "NixOS",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"rev": "f771eb401a46846c1aebd20552521b233dd7e18b",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "NixOS",
|
||||||
|
"ref": "nixos-unstable",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"pyproject-build-systems": {
|
||||||
|
"inputs": {
|
||||||
|
"nixpkgs": [
|
||||||
|
"nixpkgs"
|
||||||
|
],
|
||||||
|
"pyproject-nix": [
|
||||||
|
"pyproject-nix"
|
||||||
|
],
|
||||||
|
"uv2nix": [
|
||||||
|
"uv2nix"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1744599653,
|
||||||
|
"narHash": "sha256-nysSwVVjG4hKoOjhjvE6U5lIKA8sEr1d1QzEfZsannU=",
|
||||||
|
"owner": "pyproject-nix",
|
||||||
|
"repo": "build-system-pkgs",
|
||||||
|
"rev": "7dba6dbc73120e15b558754c26024f6c93015dd7",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "pyproject-nix",
|
||||||
|
"repo": "build-system-pkgs",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"pyproject-nix": {
|
||||||
|
"inputs": {
|
||||||
|
"nixpkgs": [
|
||||||
|
"nixpkgs"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1743438845,
|
||||||
|
"narHash": "sha256-1GSaoubGtvsLRwoYwHjeKYq40tLwvuFFVhGrG8J9Oek=",
|
||||||
|
"owner": "pyproject-nix",
|
||||||
|
"repo": "pyproject.nix",
|
||||||
|
"rev": "8063ec98edc459571d042a640b1c5e334ecfca1e",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "pyproject-nix",
|
||||||
|
"repo": "pyproject.nix",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": {
|
||||||
|
"inputs": {
|
||||||
|
"nixpkgs": "nixpkgs",
|
||||||
|
"pyproject-build-systems": "pyproject-build-systems",
|
||||||
|
"pyproject-nix": "pyproject-nix",
|
||||||
|
"uv2nix": "uv2nix"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"uv2nix": {
|
||||||
|
"inputs": {
|
||||||
|
"nixpkgs": [
|
||||||
|
"nixpkgs"
|
||||||
|
],
|
||||||
|
"pyproject-nix": [
|
||||||
|
"pyproject-nix"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1745328266,
|
||||||
|
"narHash": "sha256-ykgcOadiU9Z67P2MOjB0r06r35cQu65t0fzDeYR1uzc=",
|
||||||
|
"owner": "pyproject-nix",
|
||||||
|
"repo": "uv2nix",
|
||||||
|
"rev": "bcadc56a1e90d89bf32cc4ac308d8252e2adf855",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "pyproject-nix",
|
||||||
|
"repo": "uv2nix",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": "root",
|
||||||
|
"version": 7
|
||||||
|
}
|
pyproject.toml (new file, 31 lines)
@@ -0,0 +1,31 @@
[project]
name = "sprint-econtai"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "dotenv>=0.9.9",
    "jupyter>=1.1.1",
    "notebook>=7.4.1",
    "openai>=1.76.0",
    "openpyxl>=3.1.5",
    "pandas>=2.2.3",
    "requests>=2.32.3",
    "tqdm>=4.67.1",
]


[tool.pytest.ini_options]
pythonpath="src"
addopts="-v"
asyncio_mode = "auto"

[tool.black]
line-length = 100

[tool.isort]
profile = "black"

[dependency-groups]
dev = []