Félix Dorn 2025-04-26 23:38:19 +02:00
parent 8c0b53a32c
commit 19bf2e6b18
9 changed files with 2675 additions and 1 deletion

.env.example Normal file (1 line changed)

@@ -0,0 +1 @@
export OPENAI_API_KEY=

.envrc (2 lines changed)

@@ -1 +1 @@
-use flake
+use flake .#impure

.gitignore vendored Normal file (8 lines changed)

@@ -0,0 +1,8 @@
epoch_task_data.csv
oesm23national.xlsx
onet.database*
onet_occupation_data.json
schema.sql
task_ratings_enriched.json
.env
.ipynb_checkpoints

Untitled.ipynb Normal file (641 lines changed)

@@ -0,0 +1,641 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 86,
"id": "beace815-b5ae-44a4-a81c-a7f82cb66296",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2K\u001b[2mResolved \u001b[1m118 packages\u001b[0m \u001b[2min 386ms\u001b[0m\u001b[0m \u001b[0m\n",
"\u001b[2K\u001b[2mPrepared \u001b[1m2 packages\u001b[0m \u001b[2min 124ms\u001b[0m\u001b[0m \n",
"\u001b[2K\u001b[2mInstalled \u001b[1m2 packages\u001b[0m \u001b[2min 5ms\u001b[0m\u001b[0m \u001b[0m\n",
" \u001b[32m+\u001b[39m \u001b[1met-xmlfile\u001b[0m\u001b[2m==2.0.0\u001b[0m\n",
" \u001b[32m+\u001b[39m \u001b[1mopenpyxl\u001b[0m\u001b[2m==3.1.5\u001b[0m\n"
]
}
],
"source": [
"!uv add pandas requests openai dotenv openpyxl"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "941d511f-ad72-4306-bbab-52127583e513",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import dotenv\n",
"import openai\n",
"import sqlite3\n",
"import pandas as pd\n",
"\n",
"dotenv.load_dotenv() # Copy .env.example to .env and fill in the blanks\n",
"\n",
"oai_token = os.getenv(\"OPENAI_API_KEY\")\n",
"\n",
"oai = openai.OpenAI(api_key=oai_token)\n",
"onet = sqlite3.connect(\"onet.database\") # Run ./create_onet_database.sh to create it\n",
"# This dataset comes from https://epoch.ai/gradient-updates/consequences-of-automating-remote-work\n",
"# It contains labels for whethere a O*NET task can be done remotely or not (labeled by GPT-4o)\n",
"# You can download it here: https://drive.google.com/file/d/1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r/view?usp=sharing\n",
"df_remote_status = pd.read_csv(\"epoch_task_data.csv\")\n",
"\n",
"# BLS OEWS: https://www.bls.gov/oes/special-requests/oesm23nat.zip\n",
"df_oesm = pd.read_excel(\"oesm23national.xlsx\")\n",
"\n",
"# Run uv run enrich_task_ratings.py to get this file (trs = Task RatingS)\n",
"df_enriched_trs = pd.read_json(\"task_ratings_enriched.json\")"
]
},
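{
"cell_type": "code",
"execution_count": null,
"id": "0d3c2f1a-5b6e-4c7d-8e9f-a0b1c2d3e4f5",
"metadata": {},
"outputs": [],
"source": [
"# Added sketch (not executed): quick sanity check that the three datasets loaded.\n",
"print('remote-status labels:', df_remote_status.shape)\n",
"print('OEWS national wages:', df_oesm.shape)\n",
"print('enriched task ratings:', df_enriched_trs.shape)"
]
},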
{
"cell_type": "code",
"execution_count": 3,
"id": "a5351f8b-c2ad-4d3e-af4a-992f539a6064",
"metadata": {},
"outputs": [],
"source": [
"FREQUENCY_MAP = {\n",
" 'frequency_category_1': \"Yearly or less\",\n",
" 'frequency_category_2': \"More than yearly\",\n",
" 'frequency_category_3': \"More than monthly\",\n",
" 'frequency_category_4': \"More than weekly\",\n",
" 'frequency_category_5': \"Daily\",\n",
" 'frequency_category_6': \"Several times daily\",\n",
" 'frequency_category_7': \"Hourly or more\"\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "8b2ab22a-afab-41f9-81a3-48eab261b568",
"metadata": {},
"outputs": [],
"source": [
"background_prompt = '''\n",
"Estimate the typical duration to complete *one instance* of the following job task from the moment a person starts to work on it to the last moment the person will need to keep it in mind\n",
"\n",
"Take into account that there might be delays between the steps to complete the task, which would lengthen the estimate.\n",
"\n",
"Output a range with the format [duration A] - [duration B] where [duration A] and [duration B] correspond to one of the durations below:\n",
"- less than 30 minutes\n",
"- 30 minutes\n",
"- 1 hour\n",
"- 4 hours\n",
"- 8 hours\n",
"- 16 hours\n",
"- 3 days\n",
"- 1 week\n",
"- 3 weeks\n",
"- 6 weeks\n",
"- 3 months\n",
"- 6 months\n",
"- 1 year\n",
"- 3 years\n",
"- more than 3 year\n",
"\n",
"**Do not output anything besides the range**\n",
"'''"
]
},
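{
"cell_type": "markdown",
"id": "7c1a9e2b-3d4f-4a5b-9c6d-1e2f3a4b5c6d",
"metadata": {},
"source": [
"A sketch (not executed) of how `background_prompt` could be applied to a single task. The model name and sampling settings are assumptions mirroring the call in the final cell."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2f8b7a6c-5d4e-4f3a-8b2c-9d0e1f2a3b4c",
"metadata": {},
"outputs": [],
"source": [
"def estimate_duration_range(task_text):\n",
"    # Assumed model/settings; the prompt constrains the output to a\n",
"    # \"[duration A] - [duration B]\" range drawn from the listed buckets.\n",
"    response = oai.chat.completions.create(\n",
"        model=\"gpt-4.1-2025-04-14\",\n",
"        messages=[\n",
"            {\"role\": \"system\", \"content\": background_prompt},\n",
"            {\"role\": \"user\", \"content\": task_text},\n",
"        ],\n",
"        temperature=0,\n",
"    )\n",
"    return response.choices[0].message.content.strip()\n",
"\n",
"# estimate_duration_range(\"Develop or recommend network security measures.\")\n",
"# -> e.g. \"1 week - 6 weeks\" (format defined by the prompt)"
]
},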
{
"cell_type": "code",
"execution_count": 11,
"id": "d2e4a855-f327-4b3d-ad0b-ed997e720639",
"metadata": {},
"outputs": [],
"source": [
"df_oesm_detailed = df_oesm[df_oesm['O_GROUP'] == 'detailed'][['OCC_CODE', 'TOT_EMP', 'H_MEAN', 'A_MEAN']].copy()\n",
"df_enriched_trs['occ_code_join'] = df_enriched_trs['onetsoc_code'].str[:7]\n",
"df_merged = pd.merge(\n",
" df_enriched_trs,\n",
" df_oesm_detailed,\n",
" left_on='occ_code_join',\n",
" right_on='OCC_CODE',\n",
" how='left'\n",
")\n",
"df_merged = df_merged.drop(columns=['occ_code_join'])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "9be7acb5-2374-4f61-bba3-13b0077c0bd2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Task: Develop or recommend network security measures, such as firewalls, network security audits, or automated security probes.\n",
"Occupation Description: Design and implement computer and information networks, such as local area networks (LAN), wide area networks (WAN), intranets, extranets, and other data communications networks. Perform network modeling, analysis, and planning, including analysis of capacity needs for network infrastructures. May also design network and computer security measures. May research and recommend network and data communications hardware and software.\n",
"Occupation Title: Computer Network Architects\n"
]
},
{
"data": {
"text/plain": [
"onetsoc_code 15-1241.00\n",
"task_id 18971\n",
"task Develop or recommend network security measures...\n",
"occupation_title Computer Network Architects\n",
"occupation_description Design and implement computer and information ...\n",
"Yearly or less 0.0\n",
"More than yearly 30.0\n",
"More than monthly 15.0\n",
"More than weekly 20.0\n",
"Daily 15.0\n",
"Several times daily 15.0\n",
"Hourly or more 5.0\n",
"importance_average 4.35\n",
"relevance_average 100.0\n",
"occ_code_join 15-1241\n",
"remote remote\n",
"Name: 45200, dtype: object"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"df_merged = pd \\\n",
" .merge(left=df_enriched_trs, right=df_remote_status[['O*NET-SOC Code', 'Remote']], how='left', left_on='onetsoc_code', right_on='O*NET-SOC Code') \\\n",
" .drop(columns=['O*NET-SOC Code']) \\\n",
" .rename(columns={'Remote': 'remote'}) \\\n",
" .rename(columns=FREQUENCY_MAP) \\\n",
" .query('remote == \"remote\" and importance_average >= 3')\n",
"\n",
"row = df_merged.iloc[30000]\n",
"print('Task: ', row['task'])\n",
"print('Occupation Description: ', row['occupation_description'])\n",
"print('Occupation Title: ', row['occupation_title'])\n",
"\n",
"row"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "9e5ea89f-2c18-459d-851d-dacb379f4a2e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>onetsoc_code</th>\n",
" <th>task_id</th>\n",
" <th>task</th>\n",
" <th>occupation_title</th>\n",
" <th>occupation_description</th>\n",
" <th>Yearly or less</th>\n",
" <th>More than yearly</th>\n",
" <th>More than monthly</th>\n",
" <th>More than weekly</th>\n",
" <th>Daily</th>\n",
" <th>Several times daily</th>\n",
" <th>Hourly or more</th>\n",
" <th>importance_average</th>\n",
" <th>relevance_average</th>\n",
" <th>remote</th>\n",
" <th>OCC_CODE</th>\n",
" <th>TOT_EMP</th>\n",
" <th>H_MEAN</th>\n",
" <th>A_MEAN</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>11-1011.00</td>\n",
" <td>8823</td>\n",
" <td>Direct or coordinate an organization's financi...</td>\n",
" <td>Chief Executives</td>\n",
" <td>Determine and formulate policies and provide o...</td>\n",
" <td>5.92</td>\n",
" <td>15.98</td>\n",
" <td>29.68</td>\n",
" <td>21.18</td>\n",
" <td>19.71</td>\n",
" <td>4.91</td>\n",
" <td>2.63</td>\n",
" <td>4.52</td>\n",
" <td>74.44</td>\n",
" <td>remote</td>\n",
" <td>11-1011</td>\n",
" <td>211230.0</td>\n",
" <td>124.47</td>\n",
" <td>258900</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>11-1011.00</td>\n",
" <td>8823</td>\n",
" <td>Direct or coordinate an organization's financi...</td>\n",
" <td>Chief Executives</td>\n",
" <td>Determine and formulate policies and provide o...</td>\n",
" <td>5.92</td>\n",
" <td>15.98</td>\n",
" <td>29.68</td>\n",
" <td>21.18</td>\n",
" <td>19.71</td>\n",
" <td>4.91</td>\n",
" <td>2.63</td>\n",
" <td>4.52</td>\n",
" <td>74.44</td>\n",
" <td>remote</td>\n",
" <td>11-1011</td>\n",
" <td>211230.0</td>\n",
" <td>124.47</td>\n",
" <td>258900</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>11-1011.00</td>\n",
" <td>8823</td>\n",
" <td>Direct or coordinate an organization's financi...</td>\n",
" <td>Chief Executives</td>\n",
" <td>Determine and formulate policies and provide o...</td>\n",
" <td>5.92</td>\n",
" <td>15.98</td>\n",
" <td>29.68</td>\n",
" <td>21.18</td>\n",
" <td>19.71</td>\n",
" <td>4.91</td>\n",
" <td>2.63</td>\n",
" <td>4.52</td>\n",
" <td>74.44</td>\n",
" <td>remote</td>\n",
" <td>11-1011</td>\n",
" <td>211230.0</td>\n",
" <td>124.47</td>\n",
" <td>258900</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>11-1011.00</td>\n",
" <td>8823</td>\n",
" <td>Direct or coordinate an organization's financi...</td>\n",
" <td>Chief Executives</td>\n",
" <td>Determine and formulate policies and provide o...</td>\n",
" <td>5.92</td>\n",
" <td>15.98</td>\n",
" <td>29.68</td>\n",
" <td>21.18</td>\n",
" <td>19.71</td>\n",
" <td>4.91</td>\n",
" <td>2.63</td>\n",
" <td>4.52</td>\n",
" <td>74.44</td>\n",
" <td>remote</td>\n",
" <td>11-1011</td>\n",
" <td>211230.0</td>\n",
" <td>124.47</td>\n",
" <td>258900</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>11-1011.00</td>\n",
" <td>8823</td>\n",
" <td>Direct or coordinate an organization's financi...</td>\n",
" <td>Chief Executives</td>\n",
" <td>Determine and formulate policies and provide o...</td>\n",
" <td>5.92</td>\n",
" <td>15.98</td>\n",
" <td>29.68</td>\n",
" <td>21.18</td>\n",
" <td>19.71</td>\n",
" <td>4.91</td>\n",
" <td>2.63</td>\n",
" <td>4.52</td>\n",
" <td>74.44</td>\n",
" <td>remote</td>\n",
" <td>11-1011</td>\n",
" <td>211230.0</td>\n",
" <td>124.47</td>\n",
" <td>258900</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>127653</th>\n",
" <td>53-7121.00</td>\n",
" <td>12807</td>\n",
" <td>Unload cars containing liquids by connecting h...</td>\n",
" <td>Tank Car, Truck, and Ship Loaders</td>\n",
" <td>Load and unload chemicals and bulk solids, suc...</td>\n",
" <td>6.05</td>\n",
" <td>29.21</td>\n",
" <td>6.88</td>\n",
" <td>13.95</td>\n",
" <td>27.65</td>\n",
" <td>7.93</td>\n",
" <td>8.34</td>\n",
" <td>4.08</td>\n",
" <td>64.04</td>\n",
" <td>remote</td>\n",
" <td>53-7121</td>\n",
" <td>11400.0</td>\n",
" <td>29.1</td>\n",
" <td>60530</td>\n",
" </tr>\n",
" <tr>\n",
" <th>127654</th>\n",
" <td>53-7121.00</td>\n",
" <td>12804</td>\n",
" <td>Clean interiors of tank cars or tank trucks, u...</td>\n",
" <td>Tank Car, Truck, and Ship Loaders</td>\n",
" <td>Load and unload chemicals and bulk solids, suc...</td>\n",
" <td>1.47</td>\n",
" <td>6.33</td>\n",
" <td>21.70</td>\n",
" <td>25.69</td>\n",
" <td>32.35</td>\n",
" <td>12.47</td>\n",
" <td>0.00</td>\n",
" <td>4.02</td>\n",
" <td>44.33</td>\n",
" <td>remote</td>\n",
" <td>53-7121</td>\n",
" <td>11400.0</td>\n",
" <td>29.1</td>\n",
" <td>60530</td>\n",
" </tr>\n",
" <tr>\n",
" <th>127655</th>\n",
" <td>53-7121.00</td>\n",
" <td>12803</td>\n",
" <td>Lower gauge rods into tanks or read meters to ...</td>\n",
" <td>Tank Car, Truck, and Ship Loaders</td>\n",
" <td>Load and unload chemicals and bulk solids, suc...</td>\n",
" <td>4.52</td>\n",
" <td>1.76</td>\n",
" <td>4.65</td>\n",
" <td>17.81</td>\n",
" <td>37.42</td>\n",
" <td>23.31</td>\n",
" <td>10.55</td>\n",
" <td>3.88</td>\n",
" <td>65.00</td>\n",
" <td>remote</td>\n",
" <td>53-7121</td>\n",
" <td>11400.0</td>\n",
" <td>29.1</td>\n",
" <td>60530</td>\n",
" </tr>\n",
" <tr>\n",
" <th>127656</th>\n",
" <td>53-7121.00</td>\n",
" <td>12805</td>\n",
" <td>Operate conveyors and equipment to transfer gr...</td>\n",
" <td>Tank Car, Truck, and Ship Loaders</td>\n",
" <td>Load and unload chemicals and bulk solids, suc...</td>\n",
" <td>6.97</td>\n",
" <td>12.00</td>\n",
" <td>2.52</td>\n",
" <td>5.90</td>\n",
" <td>35.48</td>\n",
" <td>22.08</td>\n",
" <td>15.05</td>\n",
" <td>3.87</td>\n",
" <td>47.90</td>\n",
" <td>remote</td>\n",
" <td>53-7121</td>\n",
" <td>11400.0</td>\n",
" <td>29.1</td>\n",
" <td>60530</td>\n",
" </tr>\n",
" <tr>\n",
" <th>127657</th>\n",
" <td>53-7121.00</td>\n",
" <td>12810</td>\n",
" <td>Perform general warehouse activities, such as ...</td>\n",
" <td>Tank Car, Truck, and Ship Loaders</td>\n",
" <td>Load and unload chemicals and bulk solids, suc...</td>\n",
" <td>5.91</td>\n",
" <td>10.85</td>\n",
" <td>6.46</td>\n",
" <td>14.46</td>\n",
" <td>34.14</td>\n",
" <td>16.39</td>\n",
" <td>11.78</td>\n",
" <td>3.53</td>\n",
" <td>47.84</td>\n",
" <td>remote</td>\n",
" <td>53-7121</td>\n",
" <td>11400.0</td>\n",
" <td>29.1</td>\n",
" <td>60530</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>127658 rows × 19 columns</p>\n",
"</div>"
],
"text/plain": [
" onetsoc_code task_id \\\n",
"0 11-1011.00 8823 \n",
"1 11-1011.00 8823 \n",
"2 11-1011.00 8823 \n",
"3 11-1011.00 8823 \n",
"4 11-1011.00 8823 \n",
"... ... ... \n",
"127653 53-7121.00 12807 \n",
"127654 53-7121.00 12804 \n",
"127655 53-7121.00 12803 \n",
"127656 53-7121.00 12805 \n",
"127657 53-7121.00 12810 \n",
"\n",
" task \\\n",
"0 Direct or coordinate an organization's financi... \n",
"1 Direct or coordinate an organization's financi... \n",
"2 Direct or coordinate an organization's financi... \n",
"3 Direct or coordinate an organization's financi... \n",
"4 Direct or coordinate an organization's financi... \n",
"... ... \n",
"127653 Unload cars containing liquids by connecting h... \n",
"127654 Clean interiors of tank cars or tank trucks, u... \n",
"127655 Lower gauge rods into tanks or read meters to ... \n",
"127656 Operate conveyors and equipment to transfer gr... \n",
"127657 Perform general warehouse activities, such as ... \n",
"\n",
" occupation_title \\\n",
"0 Chief Executives \n",
"1 Chief Executives \n",
"2 Chief Executives \n",
"3 Chief Executives \n",
"4 Chief Executives \n",
"... ... \n",
"127653 Tank Car, Truck, and Ship Loaders \n",
"127654 Tank Car, Truck, and Ship Loaders \n",
"127655 Tank Car, Truck, and Ship Loaders \n",
"127656 Tank Car, Truck, and Ship Loaders \n",
"127657 Tank Car, Truck, and Ship Loaders \n",
"\n",
" occupation_description Yearly or less \\\n",
"0 Determine and formulate policies and provide o... 5.92 \n",
"1 Determine and formulate policies and provide o... 5.92 \n",
"2 Determine and formulate policies and provide o... 5.92 \n",
"3 Determine and formulate policies and provide o... 5.92 \n",
"4 Determine and formulate policies and provide o... 5.92 \n",
"... ... ... \n",
"127653 Load and unload chemicals and bulk solids, suc... 6.05 \n",
"127654 Load and unload chemicals and bulk solids, suc... 1.47 \n",
"127655 Load and unload chemicals and bulk solids, suc... 4.52 \n",
"127656 Load and unload chemicals and bulk solids, suc... 6.97 \n",
"127657 Load and unload chemicals and bulk solids, suc... 5.91 \n",
"\n",
" More than yearly More than monthly More than weekly Daily \\\n",
"0 15.98 29.68 21.18 19.71 \n",
"1 15.98 29.68 21.18 19.71 \n",
"2 15.98 29.68 21.18 19.71 \n",
"3 15.98 29.68 21.18 19.71 \n",
"4 15.98 29.68 21.18 19.71 \n",
"... ... ... ... ... \n",
"127653 29.21 6.88 13.95 27.65 \n",
"127654 6.33 21.70 25.69 32.35 \n",
"127655 1.76 4.65 17.81 37.42 \n",
"127656 12.00 2.52 5.90 35.48 \n",
"127657 10.85 6.46 14.46 34.14 \n",
"\n",
" Several times daily Hourly or more importance_average \\\n",
"0 4.91 2.63 4.52 \n",
"1 4.91 2.63 4.52 \n",
"2 4.91 2.63 4.52 \n",
"3 4.91 2.63 4.52 \n",
"4 4.91 2.63 4.52 \n",
"... ... ... ... \n",
"127653 7.93 8.34 4.08 \n",
"127654 12.47 0.00 4.02 \n",
"127655 23.31 10.55 3.88 \n",
"127656 22.08 15.05 3.87 \n",
"127657 16.39 11.78 3.53 \n",
"\n",
" relevance_average remote OCC_CODE TOT_EMP H_MEAN A_MEAN \n",
"0 74.44 remote 11-1011 211230.0 124.47 258900 \n",
"1 74.44 remote 11-1011 211230.0 124.47 258900 \n",
"2 74.44 remote 11-1011 211230.0 124.47 258900 \n",
"3 74.44 remote 11-1011 211230.0 124.47 258900 \n",
"4 74.44 remote 11-1011 211230.0 124.47 258900 \n",
"... ... ... ... ... ... ... \n",
"127653 64.04 remote 53-7121 11400.0 29.1 60530 \n",
"127654 44.33 remote 53-7121 11400.0 29.1 60530 \n",
"127655 65.00 remote 53-7121 11400.0 29.1 60530 \n",
"127656 47.90 remote 53-7121 11400.0 29.1 60530 \n",
"127657 47.84 remote 53-7121 11400.0 29.1 60530 \n",
"\n",
"[127658 rows x 19 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cross-reference woth BLS OEWS\n",
"# It doesn't really make sens to have it per-task, we only need it per-occupation...\n",
"df_oesm_detailed = df_oesm[df_oesm['O_GROUP'] == 'detailed'][['OCC_CODE', 'TOT_EMP', 'H_MEAN', 'A_MEAN']].copy()\n",
"df_merged['occ_code_join'] = df_merged['onetsoc_code'].str[:7]\n",
"df_merged = pd.merge(\n",
" df_merged,\n",
" df_oesm_detailed,\n",
" left_on='occ_code_join',\n",
" right_on='OCC_CODE',\n",
" how='left'\n",
")\n",
"df_merged = df_merged.drop(columns=['occ_code_join'])\n",
"df_merged"
]
},
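{
"cell_type": "code",
"execution_count": null,
"id": "4e5f6a7b-8c9d-4e0f-a1b2-c3d4e5f6a7b8",
"metadata": {},
"outputs": [],
"source": [
"# Added sketch (not executed): the preview above shows repeated rows\n",
"# (indices 0-4 are identical), so count the exact duplicates left by the merges.\n",
"df_merged.duplicated().sum()"
]
},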
{
"cell_type": "code",
"execution_count": 76,
"id": "08f45d91-039d-4ec0-94a2-f305a3312e6a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Why did the scarecrow win an award?\n",
"\n",
"Because he was outstanding in his field!\n"
]
}
],
"source": [
"response = oai.chat.completions.create(messages=[{\"role\": \"user\", \"content\": \"Tell me a joke\"}], model=\"gpt-4.1-2025-04-14\", max_tokens=100, temperature=0.7, n=1, stop=None)\n",
"joke = response.choices[0].message.content.strip()\n",
"print(joke)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

create_onet_database.sh Executable file (85 lines changed)

@@ -0,0 +1,85 @@
#!/usr/bin/env bash
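# Builds a local SQLite copy of the O*NET 29.1 database (onet.database).
# Usage: ./create_onet_database.sh   (requires curl or wget, unzip, and sqlite3)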
# Set database name and directories
ONET_DB_NAME="onet.database"
ONET_ZIP_URL="https://www.onetcenter.org/dl_files/database/db_29_1_mysql.zip"
ONET_ZIP_FILE="db_29_1_mysql.zip"
ONET_EXTRACT_DIR="db_29_1_mysql"
# Download O*NET database only if not already downloaded
if [ ! -f "$ONET_ZIP_FILE" ]; then
echo "Downloading O*NET database from $ONET_ZIP_URL"
curl -L -o "$ONET_ZIP_FILE" "$ONET_ZIP_URL" || wget -O "$ONET_ZIP_FILE" "$ONET_ZIP_URL"
if [ $? -ne 0 ]; then
echo "Failed to download O*NET database"
exit 1
fi
else
echo "Using existing O*NET database zip file"
fi
# Extract downloaded zip file only if extraction directory doesn't exist
if [ ! -d "$ONET_EXTRACT_DIR" ]; then
echo "Extracting O*NET database files"
unzip -o "$ONET_ZIP_FILE"
if [ $? -ne 0 ]; then
echo "Failed to extract O*NET database files"
exit 1
fi
else
echo "Using existing extracted O*NET database files"
fi
# Remove existing database if it exists
if [ -f "$ONET_DB_NAME" ]; then
echo "Removing existing database"
rm "$ONET_DB_NAME"
fi
# Create a new SQLite database with settings tuned for a fast one-shot import.
# Note: these PRAGMAs are connection-scoped (journal_mode only persists for WAL),
# so they must be issued in the same sqlite3 invocation that performs the import.
echo "Creating new SQLite database: $ONET_DB_NAME with performance settings"
echo "Executing SQL files in alphabetical order (single transaction mode)"
sqlite3 "$ONET_DB_NAME" << EOF
PRAGMA journal_mode = OFF;
PRAGMA synchronous = 0;
PRAGMA cache_size = 1000000;
PRAGMA locking_mode = EXCLUSIVE;
PRAGMA temp_store = MEMORY;
PRAGMA foreign_keys = ON;
BEGIN TRANSACTION;
$(find "$ONET_EXTRACT_DIR" -name "*.sql" | sort | xargs cat)
COMMIT;
EOF
# Check if the execution was successful
if [ $? -ne 0 ]; then
echo "Error executing SQL files in batch transaction"
exit 1
else
echo "Database populated successfully. Restoring reliability settings..."
# Restore reliability-focused settings after import
sqlite3 "$ONET_DB_NAME" << EOF
PRAGMA journal_mode = WAL;
PRAGMA synchronous = NORMAL;
PRAGMA locking_mode = NORMAL;
PRAGMA temp_store = DEFAULT;
PRAGMA foreign_keys = ON;
PRAGMA optimize;
VACUUM;
EOF
if [ $? -ne 0 ]; then
echo "Warning: Failed to restore reliability settings, but database is populated"
else
echo "Reliability settings restored successfully"
fi
echo "O*NET database created and optimized successfully!"
fi

enrich_task_ratings.py Normal file (223 lines changed)

@@ -0,0 +1,223 @@
import sqlite3
import pandas as pd
import json
import os
import numpy as np  # used to replace NaN with None for JSON output
# --- Configuration ---
DB_FILE = "onet.database"
OUTPUT_FILE = "task_ratings_enriched.json"
# --- Database Interaction ---
def fetch_data_from_db(db_path):
"""
Fetches required data from the O*NET SQLite database using JOINs.
Args:
db_path (str): Path to the SQLite database file.
Returns:
pandas.DataFrame: DataFrame containing joined data from task_ratings,
task_statements, and occupation_data.
Returns None if the database file doesn't exist or an error occurs.
"""
if not os.path.exists(db_path):
print(f"Error: Database file not found at {db_path}")
return None
    conn = None
    try:
        conn = sqlite3.connect(db_path)
# Construct the SQL query to join the tables and select necessary columns
# We select all relevant columns needed for processing.
query = """
SELECT
tr.onetsoc_code,
tr.task_id,
ts.task,
od.title AS occupation_title,
od.description AS occupation_description,
tr.scale_id,
tr.category,
tr.data_value
FROM
task_ratings tr
JOIN
task_statements ts ON tr.task_id = ts.task_id
JOIN
occupation_data od ON tr.onetsoc_code = od.onetsoc_code;
"""
df = pd.read_sql_query(query, conn)
conn.close()
print(f"Successfully fetched {len(df)} records from the database.")
return df
except sqlite3.Error as e:
print(f"SQLite error: {e}")
if conn:
conn.close()
return None
    except Exception as e:
        print(f"An error occurred during data fetching: {e}")
        if conn:
            conn.close()
        return None
# --- Data Processing ---
def process_task_ratings(df):
"""
Processes the fetched data to group, pivot frequency, calculate averages,
and structure the output.
Args:
df (pandas.DataFrame): The input DataFrame with joined data.
Returns:
list: A list of dictionaries, each representing an enriched task rating.
Returns None if the input DataFrame is invalid.
"""
if df is None or df.empty:
print("Error: Input DataFrame is empty or invalid.")
return None
print("Starting data processing...")
# --- 1. Handle Frequency (FT) ---
# Filter for Frequency ratings
freq_df = df[df["scale_id"] == "FT"].copy()
# Pivot the frequency data: index by task and occupation, columns by category
# We fill missing frequency values with 0, assuming no rating means 0% for that category.
freq_pivot = freq_df.pivot_table(
index=["onetsoc_code", "task_id"],
columns="category",
values="data_value",
fill_value=0, # Fill missing categories for a task/occupation with 0
)
# Rename columns for clarity using the requested format
freq_pivot.columns = [
f"frequency_category_{int(col)}" for col in freq_pivot.columns
    ]
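    # Worked example (illustrative values): FT rows (category=3, data_value=15.0)
    # and (category=5, data_value=60.0) for one (onetsoc_code, task_id) pair pivot
    # into a single row with frequency_category_3 = 15.0, frequency_category_5 = 60.0,
    # and 0 in the remaining categories.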
print(f"Processed Frequency data. Shape: {freq_pivot.shape}")
# --- 2. Handle Importance (IM, IJ) ---
# Filter for Importance ratings
imp_df = df[df["scale_id"].isin(["IM", "IJ"])].copy()
# Group by task and occupation, calculate the mean importance
    # (pandas' groupby mean already skips NaN values)
imp_avg = (
imp_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
)
imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True)
print(f"Processed Importance data. Shape: {imp_avg.shape}")
# --- 3. Handle Relevance (RT) ---
# Filter for Relevance ratings
rel_df = df[df["scale_id"] == "RT"].copy()
# Group by task and occupation, calculate the mean relevance
rel_avg = (
rel_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
)
rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True)
print(f"Processed Relevance data. Shape: {rel_avg.shape}")
# --- 4. Get Base Task/Occupation Info ---
# Select unique combinations of task and occupation details
base_info = (
df[
[
"onetsoc_code",
"task_id",
"task",
"occupation_title",
"occupation_description",
]
]
.drop_duplicates()
.set_index(["onetsoc_code", "task_id"])
)
print(f"Extracted base info. Shape: {base_info.shape}")
# --- 5. Merge Processed Data ---
# Start with the base info and merge the calculated/pivoted data
# Use 'left' joins to ensure all tasks/occupations from the base_info are kept.
# If a task/occupation doesn't have frequency, importance, or relevance ratings,
# the corresponding columns will have NaN values after the merge.
print("Merging processed data...")
final_df = base_info.merge(
freq_pivot, left_index=True, right_index=True, how="left"
)
# Set index before merging averages which are not multi-indexed
final_df = final_df.reset_index()
final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left")
final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left")
# Fill potential NaN values resulting from left joins if needed.
# For averages, NaN might mean no rating was provided. We can leave them as NaN
# or fill with 0 or another placeholder depending on desired interpretation.
# For frequency categories, NaN could mean that category wasn't rated. We filled with 0 during pivot.
# Example: Fill NaN averages with 0
# final_df['importance_average'].fillna(0, inplace=True)
# final_df['relevance_average'].fillna(0, inplace=True)
# Note: Leaving NaNs might be more informative.
print(f"Final merged data shape: {final_df.shape}")
# Convert DataFrame to list of dictionaries for JSON output
# Handle potential NaN values during JSON conversion
final_df = final_df.replace(
{np.nan: None}
) # Replace numpy NaN with Python None for JSON compatibility
result_list = final_df.to_dict(orient="records")
return result_list
# --- Output ---
def write_to_json(data, output_path):
"""
Writes the processed data to a JSON file.
Args:
data (list): The list of dictionaries to write.
output_path (str): Path to the output JSON file.
"""
if data is None:
print("No data to write to JSON.")
return
try:
with open(output_path, "w", encoding="utf-8") as f:
json.dump(data, f, indent=4, ensure_ascii=False)
print(f"Successfully wrote enriched data to {output_path}")
except IOError as e:
print(f"Error writing JSON file: {e}")
except Exception as e:
print(f"An unexpected error occurred during JSON writing: {e}")
# --- Main Execution ---
if __name__ == "__main__":
print("Starting O*NET Task Ratings Enrichment Script...")
# 1. Fetch data
raw_data_df = fetch_data_from_db(DB_FILE)
# 2. Process data
if raw_data_df is not None:
enriched_data = process_task_ratings(raw_data_df)
# 3. Write output
if enriched_data:
write_to_json(enriched_data, OUTPUT_FILE)
else:
print("Data processing failed. No output file generated.")
else:
print("Data fetching failed. Script terminated.")
print("Script finished.")

flake.lock generated Normal file (99 lines changed)

@@ -0,0 +1,99 @@
{
"nodes": {
"nixpkgs": {
"locked": {
"lastModified": 1745526057,
"narHash": "sha256-ITSpPDwvLBZBnPRS2bUcHY3gZSwis/uTe255QgMtTLA=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "f771eb401a46846c1aebd20552521b233dd7e18b",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"pyproject-build-systems": {
"inputs": {
"nixpkgs": [
"nixpkgs"
],
"pyproject-nix": [
"pyproject-nix"
],
"uv2nix": [
"uv2nix"
]
},
"locked": {
"lastModified": 1744599653,
"narHash": "sha256-nysSwVVjG4hKoOjhjvE6U5lIKA8sEr1d1QzEfZsannU=",
"owner": "pyproject-nix",
"repo": "build-system-pkgs",
"rev": "7dba6dbc73120e15b558754c26024f6c93015dd7",
"type": "github"
},
"original": {
"owner": "pyproject-nix",
"repo": "build-system-pkgs",
"type": "github"
}
},
"pyproject-nix": {
"inputs": {
"nixpkgs": [
"nixpkgs"
]
},
"locked": {
"lastModified": 1743438845,
"narHash": "sha256-1GSaoubGtvsLRwoYwHjeKYq40tLwvuFFVhGrG8J9Oek=",
"owner": "pyproject-nix",
"repo": "pyproject.nix",
"rev": "8063ec98edc459571d042a640b1c5e334ecfca1e",
"type": "github"
},
"original": {
"owner": "pyproject-nix",
"repo": "pyproject.nix",
"type": "github"
}
},
"root": {
"inputs": {
"nixpkgs": "nixpkgs",
"pyproject-build-systems": "pyproject-build-systems",
"pyproject-nix": "pyproject-nix",
"uv2nix": "uv2nix"
}
},
"uv2nix": {
"inputs": {
"nixpkgs": [
"nixpkgs"
],
"pyproject-nix": [
"pyproject-nix"
]
},
"locked": {
"lastModified": 1745328266,
"narHash": "sha256-ykgcOadiU9Z67P2MOjB0r06r35cQu65t0fzDeYR1uzc=",
"owner": "pyproject-nix",
"repo": "uv2nix",
"rev": "bcadc56a1e90d89bf32cc4ac308d8252e2adf855",
"type": "github"
},
"original": {
"owner": "pyproject-nix",
"repo": "uv2nix",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

pyproject.toml Normal file (31 lines changed)

@@ -0,0 +1,31 @@
[project]
name = "sprint-econtai"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
"dotenv>=0.9.9",
"jupyter>=1.1.1",
"notebook>=7.4.1",
"openai>=1.76.0",
"openpyxl>=3.1.5",
"pandas>=2.2.3",
"requests>=2.32.3",
"tqdm>=4.67.1",
]
[tool.pytest.ini_options]
pythonpath="src"
addopts="-v"
asyncio_mode = "auto"
[tool.black]
line-length = 100
[tool.isort]
profile = "black"
[dependency-groups]
dev = []

uv.lock generated Normal file (1586 lines changed)

File diff suppressed because it is too large.