progress

parent 8c0b53a32c
commit 19bf2e6b18

9 changed files with 2675 additions and 1 deletion
.env.example (new file, 1 line)
@@ -0,0 +1 @@
export OPENAI_API_KEY=
.envrc (modified)
@@ -1 +1 @@
-use flake
+use flake .#impure
.gitignore (vendored, new file, 8 lines)
@@ -0,0 +1,8 @@
epoch_task_data.csv
oesm23national.xlsx
onet.database*
onet_occupation_data.json
schema.sql
task_ratings_enriched.json
.env
.ipynb_checkpoints
Untitled.ipynb (new file, 641 lines)
@@ -0,0 +1,641 @@
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 86,
|
||||||
|
"id": "beace815-b5ae-44a4-a81c-a7f82cb66296",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\u001b[2K\u001b[2mResolved \u001b[1m118 packages\u001b[0m \u001b[2min 386ms\u001b[0m\u001b[0m \u001b[0m\n",
|
||||||
|
"\u001b[2K\u001b[2mPrepared \u001b[1m2 packages\u001b[0m \u001b[2min 124ms\u001b[0m\u001b[0m \n",
|
||||||
|
"\u001b[2K\u001b[2mInstalled \u001b[1m2 packages\u001b[0m \u001b[2min 5ms\u001b[0m\u001b[0m \u001b[0m\n",
|
||||||
|
" \u001b[32m+\u001b[39m \u001b[1met-xmlfile\u001b[0m\u001b[2m==2.0.0\u001b[0m\n",
|
||||||
|
" \u001b[32m+\u001b[39m \u001b[1mopenpyxl\u001b[0m\u001b[2m==3.1.5\u001b[0m\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"!uv add pandas requests openai dotenv openpyxl"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "941d511f-ad72-4306-bbab-52127583e513",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"import dotenv\n",
|
||||||
|
"import openai\n",
|
||||||
|
"import sqlite3\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"\n",
|
||||||
|
"dotenv.load_dotenv() # Copy .env.example to .env and fill in the blanks\n",
|
||||||
|
"\n",
|
||||||
|
"oai_token = os.getenv(\"OPENAI_API_KEY\")\n",
|
||||||
|
"\n",
|
||||||
|
"oai = openai.OpenAI(api_key=oai_token)\n",
|
||||||
|
"onet = sqlite3.connect(\"onet.database\") # Run ./create_onet_database.sh to create it\n",
|
||||||
|
"# This dataset comes from https://epoch.ai/gradient-updates/consequences-of-automating-remote-work\n",
|
||||||
|
"# It contains labels for whethere a O*NET task can be done remotely or not (labeled by GPT-4o)\n",
|
||||||
|
"# You can download it here: https://drive.google.com/file/d/1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r/view?usp=sharing\n",
|
||||||
|
"df_remote_status = pd.read_csv(\"epoch_task_data.csv\")\n",
|
||||||
|
"\n",
|
||||||
|
"# BLS OEWS: https://www.bls.gov/oes/special-requests/oesm23nat.zip\n",
|
||||||
|
"df_oesm = pd.read_excel(\"oesm23national.xlsx\")\n",
|
||||||
|
"\n",
|
||||||
|
"# Run uv run enrich_task_ratings.py to get this file (trs = Task RatingS)\n",
|
||||||
|
"df_enriched_trs = pd.read_json(\"task_ratings_enriched.json\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "a5351f8b-c2ad-4d3e-af4a-992f539a6064",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"FREQUENCY_MAP = {\n",
|
||||||
|
" 'frequency_category_1': \"Yearly or less\",\n",
|
||||||
|
" 'frequency_category_2': \"More than yearly\",\n",
|
||||||
|
" 'frequency_category_3': \"More than monthly\",\n",
|
||||||
|
" 'frequency_category_4': \"More than weekly\",\n",
|
||||||
|
" 'frequency_category_5': \"Daily\",\n",
|
||||||
|
" 'frequency_category_6': \"Several times daily\",\n",
|
||||||
|
" 'frequency_category_7': \"Hourly or more\"\n",
|
||||||
|
"}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "8b2ab22a-afab-41f9-81a3-48eab261b568",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"background_prompt = '''\n",
|
||||||
|
"Estimate the typical duration to complete *one instance* of the following job task from the moment a person starts to work on it to the last moment the person will need to keep it in mind\n",
|
||||||
|
"\n",
|
||||||
|
"Take into account that there might be delays between the steps to complete the task, which would lengthen the estimate.\n",
|
||||||
|
"\n",
|
||||||
|
"Output a range with the format [duration A] - [duration B] where [duration A] and [duration B] correspond to one of the durations below:\n",
|
||||||
|
"- less than 30 minutes\n",
|
||||||
|
"- 30 minutes\n",
|
||||||
|
"- 1 hour\n",
|
||||||
|
"- 4 hours\n",
|
||||||
|
"- 8 hours\n",
|
||||||
|
"- 16 hours\n",
|
||||||
|
"- 3 days\n",
|
||||||
|
"- 1 week\n",
|
||||||
|
"- 3 weeks\n",
|
||||||
|
"- 6 weeks\n",
|
||||||
|
"- 3 months\n",
|
||||||
|
"- 6 months\n",
|
||||||
|
"- 1 year\n",
|
||||||
|
"- 3 years\n",
|
||||||
|
"- more than 3 year\n",
|
||||||
|
"\n",
|
||||||
|
"**Do not output anything besides the range**\n",
|
||||||
|
"'''"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"id": "d2e4a855-f327-4b3d-ad0b-ed997e720639",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"df_oesm_detailed = df_oesm[df_oesm['O_GROUP'] == 'detailed'][['OCC_CODE', 'TOT_EMP', 'H_MEAN', 'A_MEAN']].copy()\n",
|
||||||
|
"df_enriched_trs['occ_code_join'] = df_enriched_trs['onetsoc_code'].str[:7]\n",
|
||||||
|
"df_merged = pd.merge(\n",
|
||||||
|
" df_enriched_trs,\n",
|
||||||
|
" df_oesm_detailed,\n",
|
||||||
|
" left_on='occ_code_join',\n",
|
||||||
|
" right_on='OCC_CODE',\n",
|
||||||
|
" how='left'\n",
|
||||||
|
")\n",
|
||||||
|
"df_merged = df_merged.drop(columns=['occ_code_join'])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"id": "9be7acb5-2374-4f61-bba3-13b0077c0bd2",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Task: Develop or recommend network security measures, such as firewalls, network security audits, or automated security probes.\n",
|
||||||
|
"Occupation Description: Design and implement computer and information networks, such as local area networks (LAN), wide area networks (WAN), intranets, extranets, and other data communications networks. Perform network modeling, analysis, and planning, including analysis of capacity needs for network infrastructures. May also design network and computer security measures. May research and recommend network and data communications hardware and software.\n",
|
||||||
|
"Occupation Title: Computer Network Architects\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"onetsoc_code 15-1241.00\n",
|
||||||
|
"task_id 18971\n",
|
||||||
|
"task Develop or recommend network security measures...\n",
|
||||||
|
"occupation_title Computer Network Architects\n",
|
||||||
|
"occupation_description Design and implement computer and information ...\n",
|
||||||
|
"Yearly or less 0.0\n",
|
||||||
|
"More than yearly 30.0\n",
|
||||||
|
"More than monthly 15.0\n",
|
||||||
|
"More than weekly 20.0\n",
|
||||||
|
"Daily 15.0\n",
|
||||||
|
"Several times daily 15.0\n",
|
||||||
|
"Hourly or more 5.0\n",
|
||||||
|
"importance_average 4.35\n",
|
||||||
|
"relevance_average 100.0\n",
|
||||||
|
"occ_code_join 15-1241\n",
|
||||||
|
"remote remote\n",
|
||||||
|
"Name: 45200, dtype: object"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"df_merged = pd \\\n",
|
||||||
|
" .merge(left=df_enriched_trs, right=df_remote_status[['O*NET-SOC Code', 'Remote']], how='left', left_on='onetsoc_code', right_on='O*NET-SOC Code') \\\n",
|
||||||
|
" .drop(columns=['O*NET-SOC Code']) \\\n",
|
||||||
|
" .rename(columns={'Remote': 'remote'}) \\\n",
|
||||||
|
" .rename(columns=FREQUENCY_MAP) \\\n",
|
||||||
|
" .query('remote == \"remote\" and importance_average >= 3')\n",
|
||||||
|
"\n",
|
||||||
|
"row = df_merged.iloc[30000]\n",
|
||||||
|
"print('Task: ', row['task'])\n",
|
||||||
|
"print('Occupation Description: ', row['occupation_description'])\n",
|
||||||
|
"print('Occupation Title: ', row['occupation_title'])\n",
|
||||||
|
"\n",
|
||||||
|
"row"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"id": "9e5ea89f-2c18-459d-851d-dacb379f4a2e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>onetsoc_code</th>\n",
|
||||||
|
" <th>task_id</th>\n",
|
||||||
|
" <th>task</th>\n",
|
||||||
|
" <th>occupation_title</th>\n",
|
||||||
|
" <th>occupation_description</th>\n",
|
||||||
|
" <th>Yearly or less</th>\n",
|
||||||
|
" <th>More than yearly</th>\n",
|
||||||
|
" <th>More than monthly</th>\n",
|
||||||
|
" <th>More than weekly</th>\n",
|
||||||
|
" <th>Daily</th>\n",
|
||||||
|
" <th>Several times daily</th>\n",
|
||||||
|
" <th>Hourly or more</th>\n",
|
||||||
|
" <th>importance_average</th>\n",
|
||||||
|
" <th>relevance_average</th>\n",
|
||||||
|
" <th>remote</th>\n",
|
||||||
|
" <th>OCC_CODE</th>\n",
|
||||||
|
" <th>TOT_EMP</th>\n",
|
||||||
|
" <th>H_MEAN</th>\n",
|
||||||
|
" <th>A_MEAN</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>11-1011.00</td>\n",
|
||||||
|
" <td>8823</td>\n",
|
||||||
|
" <td>Direct or coordinate an organization's financi...</td>\n",
|
||||||
|
" <td>Chief Executives</td>\n",
|
||||||
|
" <td>Determine and formulate policies and provide o...</td>\n",
|
||||||
|
" <td>5.92</td>\n",
|
||||||
|
" <td>15.98</td>\n",
|
||||||
|
" <td>29.68</td>\n",
|
||||||
|
" <td>21.18</td>\n",
|
||||||
|
" <td>19.71</td>\n",
|
||||||
|
" <td>4.91</td>\n",
|
||||||
|
" <td>2.63</td>\n",
|
||||||
|
" <td>4.52</td>\n",
|
||||||
|
" <td>74.44</td>\n",
|
||||||
|
" <td>remote</td>\n",
|
||||||
|
" <td>11-1011</td>\n",
|
||||||
|
" <td>211230.0</td>\n",
|
||||||
|
" <td>124.47</td>\n",
|
||||||
|
" <td>258900</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>11-1011.00</td>\n",
|
||||||
|
" <td>8823</td>\n",
|
||||||
|
" <td>Direct or coordinate an organization's financi...</td>\n",
|
||||||
|
" <td>Chief Executives</td>\n",
|
||||||
|
" <td>Determine and formulate policies and provide o...</td>\n",
|
||||||
|
" <td>5.92</td>\n",
|
||||||
|
" <td>15.98</td>\n",
|
||||||
|
" <td>29.68</td>\n",
|
||||||
|
" <td>21.18</td>\n",
|
||||||
|
" <td>19.71</td>\n",
|
||||||
|
" <td>4.91</td>\n",
|
||||||
|
" <td>2.63</td>\n",
|
||||||
|
" <td>4.52</td>\n",
|
||||||
|
" <td>74.44</td>\n",
|
||||||
|
" <td>remote</td>\n",
|
||||||
|
" <td>11-1011</td>\n",
|
||||||
|
" <td>211230.0</td>\n",
|
||||||
|
" <td>124.47</td>\n",
|
||||||
|
" <td>258900</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>11-1011.00</td>\n",
|
||||||
|
" <td>8823</td>\n",
|
||||||
|
" <td>Direct or coordinate an organization's financi...</td>\n",
|
||||||
|
" <td>Chief Executives</td>\n",
|
||||||
|
" <td>Determine and formulate policies and provide o...</td>\n",
|
||||||
|
" <td>5.92</td>\n",
|
||||||
|
" <td>15.98</td>\n",
|
||||||
|
" <td>29.68</td>\n",
|
||||||
|
" <td>21.18</td>\n",
|
||||||
|
" <td>19.71</td>\n",
|
||||||
|
" <td>4.91</td>\n",
|
||||||
|
" <td>2.63</td>\n",
|
||||||
|
" <td>4.52</td>\n",
|
||||||
|
" <td>74.44</td>\n",
|
||||||
|
" <td>remote</td>\n",
|
||||||
|
" <td>11-1011</td>\n",
|
||||||
|
" <td>211230.0</td>\n",
|
||||||
|
" <td>124.47</td>\n",
|
||||||
|
" <td>258900</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>11-1011.00</td>\n",
|
||||||
|
" <td>8823</td>\n",
|
||||||
|
" <td>Direct or coordinate an organization's financi...</td>\n",
|
||||||
|
" <td>Chief Executives</td>\n",
|
||||||
|
" <td>Determine and formulate policies and provide o...</td>\n",
|
||||||
|
" <td>5.92</td>\n",
|
||||||
|
" <td>15.98</td>\n",
|
||||||
|
" <td>29.68</td>\n",
|
||||||
|
" <td>21.18</td>\n",
|
||||||
|
" <td>19.71</td>\n",
|
||||||
|
" <td>4.91</td>\n",
|
||||||
|
" <td>2.63</td>\n",
|
||||||
|
" <td>4.52</td>\n",
|
||||||
|
" <td>74.44</td>\n",
|
||||||
|
" <td>remote</td>\n",
|
||||||
|
" <td>11-1011</td>\n",
|
||||||
|
" <td>211230.0</td>\n",
|
||||||
|
" <td>124.47</td>\n",
|
||||||
|
" <td>258900</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>11-1011.00</td>\n",
|
||||||
|
" <td>8823</td>\n",
|
||||||
|
" <td>Direct or coordinate an organization's financi...</td>\n",
|
||||||
|
" <td>Chief Executives</td>\n",
|
||||||
|
" <td>Determine and formulate policies and provide o...</td>\n",
|
||||||
|
" <td>5.92</td>\n",
|
||||||
|
" <td>15.98</td>\n",
|
||||||
|
" <td>29.68</td>\n",
|
||||||
|
" <td>21.18</td>\n",
|
||||||
|
" <td>19.71</td>\n",
|
||||||
|
" <td>4.91</td>\n",
|
||||||
|
" <td>2.63</td>\n",
|
||||||
|
" <td>4.52</td>\n",
|
||||||
|
" <td>74.44</td>\n",
|
||||||
|
" <td>remote</td>\n",
|
||||||
|
" <td>11-1011</td>\n",
|
||||||
|
" <td>211230.0</td>\n",
|
||||||
|
" <td>124.47</td>\n",
|
||||||
|
" <td>258900</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>...</th>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>127653</th>\n",
|
||||||
|
" <td>53-7121.00</td>\n",
|
||||||
|
" <td>12807</td>\n",
|
||||||
|
" <td>Unload cars containing liquids by connecting h...</td>\n",
|
||||||
|
" <td>Tank Car, Truck, and Ship Loaders</td>\n",
|
||||||
|
" <td>Load and unload chemicals and bulk solids, suc...</td>\n",
|
||||||
|
" <td>6.05</td>\n",
|
||||||
|
" <td>29.21</td>\n",
|
||||||
|
" <td>6.88</td>\n",
|
||||||
|
" <td>13.95</td>\n",
|
||||||
|
" <td>27.65</td>\n",
|
||||||
|
" <td>7.93</td>\n",
|
||||||
|
" <td>8.34</td>\n",
|
||||||
|
" <td>4.08</td>\n",
|
||||||
|
" <td>64.04</td>\n",
|
||||||
|
" <td>remote</td>\n",
|
||||||
|
" <td>53-7121</td>\n",
|
||||||
|
" <td>11400.0</td>\n",
|
||||||
|
" <td>29.1</td>\n",
|
||||||
|
" <td>60530</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>127654</th>\n",
|
||||||
|
" <td>53-7121.00</td>\n",
|
||||||
|
" <td>12804</td>\n",
|
||||||
|
" <td>Clean interiors of tank cars or tank trucks, u...</td>\n",
|
||||||
|
" <td>Tank Car, Truck, and Ship Loaders</td>\n",
|
||||||
|
" <td>Load and unload chemicals and bulk solids, suc...</td>\n",
|
||||||
|
" <td>1.47</td>\n",
|
||||||
|
" <td>6.33</td>\n",
|
||||||
|
" <td>21.70</td>\n",
|
||||||
|
" <td>25.69</td>\n",
|
||||||
|
" <td>32.35</td>\n",
|
||||||
|
" <td>12.47</td>\n",
|
||||||
|
" <td>0.00</td>\n",
|
||||||
|
" <td>4.02</td>\n",
|
||||||
|
" <td>44.33</td>\n",
|
||||||
|
" <td>remote</td>\n",
|
||||||
|
" <td>53-7121</td>\n",
|
||||||
|
" <td>11400.0</td>\n",
|
||||||
|
" <td>29.1</td>\n",
|
||||||
|
" <td>60530</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>127655</th>\n",
|
||||||
|
" <td>53-7121.00</td>\n",
|
||||||
|
" <td>12803</td>\n",
|
||||||
|
" <td>Lower gauge rods into tanks or read meters to ...</td>\n",
|
||||||
|
" <td>Tank Car, Truck, and Ship Loaders</td>\n",
|
||||||
|
" <td>Load and unload chemicals and bulk solids, suc...</td>\n",
|
||||||
|
" <td>4.52</td>\n",
|
||||||
|
" <td>1.76</td>\n",
|
||||||
|
" <td>4.65</td>\n",
|
||||||
|
" <td>17.81</td>\n",
|
||||||
|
" <td>37.42</td>\n",
|
||||||
|
" <td>23.31</td>\n",
|
||||||
|
" <td>10.55</td>\n",
|
||||||
|
" <td>3.88</td>\n",
|
||||||
|
" <td>65.00</td>\n",
|
||||||
|
" <td>remote</td>\n",
|
||||||
|
" <td>53-7121</td>\n",
|
||||||
|
" <td>11400.0</td>\n",
|
||||||
|
" <td>29.1</td>\n",
|
||||||
|
" <td>60530</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>127656</th>\n",
|
||||||
|
" <td>53-7121.00</td>\n",
|
||||||
|
" <td>12805</td>\n",
|
||||||
|
" <td>Operate conveyors and equipment to transfer gr...</td>\n",
|
||||||
|
" <td>Tank Car, Truck, and Ship Loaders</td>\n",
|
||||||
|
" <td>Load and unload chemicals and bulk solids, suc...</td>\n",
|
||||||
|
" <td>6.97</td>\n",
|
||||||
|
" <td>12.00</td>\n",
|
||||||
|
" <td>2.52</td>\n",
|
||||||
|
" <td>5.90</td>\n",
|
||||||
|
" <td>35.48</td>\n",
|
||||||
|
" <td>22.08</td>\n",
|
||||||
|
" <td>15.05</td>\n",
|
||||||
|
" <td>3.87</td>\n",
|
||||||
|
" <td>47.90</td>\n",
|
||||||
|
" <td>remote</td>\n",
|
||||||
|
" <td>53-7121</td>\n",
|
||||||
|
" <td>11400.0</td>\n",
|
||||||
|
" <td>29.1</td>\n",
|
||||||
|
" <td>60530</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>127657</th>\n",
|
||||||
|
" <td>53-7121.00</td>\n",
|
||||||
|
" <td>12810</td>\n",
|
||||||
|
" <td>Perform general warehouse activities, such as ...</td>\n",
|
||||||
|
" <td>Tank Car, Truck, and Ship Loaders</td>\n",
|
||||||
|
" <td>Load and unload chemicals and bulk solids, suc...</td>\n",
|
||||||
|
" <td>5.91</td>\n",
|
||||||
|
" <td>10.85</td>\n",
|
||||||
|
" <td>6.46</td>\n",
|
||||||
|
" <td>14.46</td>\n",
|
||||||
|
" <td>34.14</td>\n",
|
||||||
|
" <td>16.39</td>\n",
|
||||||
|
" <td>11.78</td>\n",
|
||||||
|
" <td>3.53</td>\n",
|
||||||
|
" <td>47.84</td>\n",
|
||||||
|
" <td>remote</td>\n",
|
||||||
|
" <td>53-7121</td>\n",
|
||||||
|
" <td>11400.0</td>\n",
|
||||||
|
" <td>29.1</td>\n",
|
||||||
|
" <td>60530</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"<p>127658 rows × 19 columns</p>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" onetsoc_code task_id \\\n",
|
||||||
|
"0 11-1011.00 8823 \n",
|
||||||
|
"1 11-1011.00 8823 \n",
|
||||||
|
"2 11-1011.00 8823 \n",
|
||||||
|
"3 11-1011.00 8823 \n",
|
||||||
|
"4 11-1011.00 8823 \n",
|
||||||
|
"... ... ... \n",
|
||||||
|
"127653 53-7121.00 12807 \n",
|
||||||
|
"127654 53-7121.00 12804 \n",
|
||||||
|
"127655 53-7121.00 12803 \n",
|
||||||
|
"127656 53-7121.00 12805 \n",
|
||||||
|
"127657 53-7121.00 12810 \n",
|
||||||
|
"\n",
|
||||||
|
" task \\\n",
|
||||||
|
"0 Direct or coordinate an organization's financi... \n",
|
||||||
|
"1 Direct or coordinate an organization's financi... \n",
|
||||||
|
"2 Direct or coordinate an organization's financi... \n",
|
||||||
|
"3 Direct or coordinate an organization's financi... \n",
|
||||||
|
"4 Direct or coordinate an organization's financi... \n",
|
||||||
|
"... ... \n",
|
||||||
|
"127653 Unload cars containing liquids by connecting h... \n",
|
||||||
|
"127654 Clean interiors of tank cars or tank trucks, u... \n",
|
||||||
|
"127655 Lower gauge rods into tanks or read meters to ... \n",
|
||||||
|
"127656 Operate conveyors and equipment to transfer gr... \n",
|
||||||
|
"127657 Perform general warehouse activities, such as ... \n",
|
||||||
|
"\n",
|
||||||
|
" occupation_title \\\n",
|
||||||
|
"0 Chief Executives \n",
|
||||||
|
"1 Chief Executives \n",
|
||||||
|
"2 Chief Executives \n",
|
||||||
|
"3 Chief Executives \n",
|
||||||
|
"4 Chief Executives \n",
|
||||||
|
"... ... \n",
|
||||||
|
"127653 Tank Car, Truck, and Ship Loaders \n",
|
||||||
|
"127654 Tank Car, Truck, and Ship Loaders \n",
|
||||||
|
"127655 Tank Car, Truck, and Ship Loaders \n",
|
||||||
|
"127656 Tank Car, Truck, and Ship Loaders \n",
|
||||||
|
"127657 Tank Car, Truck, and Ship Loaders \n",
|
||||||
|
"\n",
|
||||||
|
" occupation_description Yearly or less \\\n",
|
||||||
|
"0 Determine and formulate policies and provide o... 5.92 \n",
|
||||||
|
"1 Determine and formulate policies and provide o... 5.92 \n",
|
||||||
|
"2 Determine and formulate policies and provide o... 5.92 \n",
|
||||||
|
"3 Determine and formulate policies and provide o... 5.92 \n",
|
||||||
|
"4 Determine and formulate policies and provide o... 5.92 \n",
|
||||||
|
"... ... ... \n",
|
||||||
|
"127653 Load and unload chemicals and bulk solids, suc... 6.05 \n",
|
||||||
|
"127654 Load and unload chemicals and bulk solids, suc... 1.47 \n",
|
||||||
|
"127655 Load and unload chemicals and bulk solids, suc... 4.52 \n",
|
||||||
|
"127656 Load and unload chemicals and bulk solids, suc... 6.97 \n",
|
||||||
|
"127657 Load and unload chemicals and bulk solids, suc... 5.91 \n",
|
||||||
|
"\n",
|
||||||
|
" More than yearly More than monthly More than weekly Daily \\\n",
|
||||||
|
"0 15.98 29.68 21.18 19.71 \n",
|
||||||
|
"1 15.98 29.68 21.18 19.71 \n",
|
||||||
|
"2 15.98 29.68 21.18 19.71 \n",
|
||||||
|
"3 15.98 29.68 21.18 19.71 \n",
|
||||||
|
"4 15.98 29.68 21.18 19.71 \n",
|
||||||
|
"... ... ... ... ... \n",
|
||||||
|
"127653 29.21 6.88 13.95 27.65 \n",
|
||||||
|
"127654 6.33 21.70 25.69 32.35 \n",
|
||||||
|
"127655 1.76 4.65 17.81 37.42 \n",
|
||||||
|
"127656 12.00 2.52 5.90 35.48 \n",
|
||||||
|
"127657 10.85 6.46 14.46 34.14 \n",
|
||||||
|
"\n",
|
||||||
|
" Several times daily Hourly or more importance_average \\\n",
|
||||||
|
"0 4.91 2.63 4.52 \n",
|
||||||
|
"1 4.91 2.63 4.52 \n",
|
||||||
|
"2 4.91 2.63 4.52 \n",
|
||||||
|
"3 4.91 2.63 4.52 \n",
|
||||||
|
"4 4.91 2.63 4.52 \n",
|
||||||
|
"... ... ... ... \n",
|
||||||
|
"127653 7.93 8.34 4.08 \n",
|
||||||
|
"127654 12.47 0.00 4.02 \n",
|
||||||
|
"127655 23.31 10.55 3.88 \n",
|
||||||
|
"127656 22.08 15.05 3.87 \n",
|
||||||
|
"127657 16.39 11.78 3.53 \n",
|
||||||
|
"\n",
|
||||||
|
" relevance_average remote OCC_CODE TOT_EMP H_MEAN A_MEAN \n",
|
||||||
|
"0 74.44 remote 11-1011 211230.0 124.47 258900 \n",
|
||||||
|
"1 74.44 remote 11-1011 211230.0 124.47 258900 \n",
|
||||||
|
"2 74.44 remote 11-1011 211230.0 124.47 258900 \n",
|
||||||
|
"3 74.44 remote 11-1011 211230.0 124.47 258900 \n",
|
||||||
|
"4 74.44 remote 11-1011 211230.0 124.47 258900 \n",
|
||||||
|
"... ... ... ... ... ... ... \n",
|
||||||
|
"127653 64.04 remote 53-7121 11400.0 29.1 60530 \n",
|
||||||
|
"127654 44.33 remote 53-7121 11400.0 29.1 60530 \n",
|
||||||
|
"127655 65.00 remote 53-7121 11400.0 29.1 60530 \n",
|
||||||
|
"127656 47.90 remote 53-7121 11400.0 29.1 60530 \n",
|
||||||
|
"127657 47.84 remote 53-7121 11400.0 29.1 60530 \n",
|
||||||
|
"\n",
|
||||||
|
"[127658 rows x 19 columns]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Cross-reference woth BLS OEWS\n",
|
||||||
|
"# It doesn't really make sens to have it per-task, we only need it per-occupation...\n",
|
||||||
|
"df_oesm_detailed = df_oesm[df_oesm['O_GROUP'] == 'detailed'][['OCC_CODE', 'TOT_EMP', 'H_MEAN', 'A_MEAN']].copy()\n",
|
||||||
|
"df_merged['occ_code_join'] = df_merged['onetsoc_code'].str[:7]\n",
|
||||||
|
"df_merged = pd.merge(\n",
|
||||||
|
" df_merged,\n",
|
||||||
|
" df_oesm_detailed,\n",
|
||||||
|
" left_on='occ_code_join',\n",
|
||||||
|
" right_on='OCC_CODE',\n",
|
||||||
|
" how='left'\n",
|
||||||
|
")\n",
|
||||||
|
"df_merged = df_merged.drop(columns=['occ_code_join'])\n",
|
||||||
|
"df_merged"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 76,
|
||||||
|
"id": "08f45d91-039d-4ec0-94a2-f305a3312e6a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Why did the scarecrow win an award?\n",
|
||||||
|
"\n",
|
||||||
|
"Because he was outstanding in his field!\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"response = oai.chat.completions.create(messages=[{\"role\": \"user\", \"content\": \"Tell me a joke\"}], model=\"gpt-4.1-2025-04-14\", max_tokens=100, temperature=0.7, n=1, stop=None)\n",
|
||||||
|
"joke = response.choices[0].message.content.strip()\n",
|
||||||
|
"print(joke)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.13.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
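The notebook currently ends with a placeholder completion call (the joke cell) rather than the duration-estimation step that background_prompt sets up. Below is a minimal sketch of that step, assuming the oai client, background_prompt, and df_merged columns defined in the cells above; the estimate_duration helper and the small sample are illustrative only and are not part of this commit.

# Sketch only: apply background_prompt to one merged task row at a time.
def estimate_duration(row):
    task_context = (
        f"Task: {row['task']}\n"
        f"Occupation: {row['occupation_title']}\n"
        f"Occupation description: {row['occupation_description']}"
    )
    response = oai.chat.completions.create(
        model="gpt-4.1-2025-04-14",  # same model as the test cell above
        messages=[
            {"role": "system", "content": background_prompt},
            {"role": "user", "content": task_context},
        ],
        temperature=0,
    )
    return response.choices[0].message.content.strip()

sample = df_merged.head(5).copy()
sample["estimated_duration"] = sample.apply(estimate_duration, axis=1)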
create_onet_database.sh (new executable file, 85 lines)
@@ -0,0 +1,85 @@
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
# Set database name and directories
|
||||||
|
ONET_DB_NAME="onet.database"
|
||||||
|
ONET_ZIP_URL="https://www.onetcenter.org/dl_files/database/db_29_1_mysql.zip"
|
||||||
|
ONET_ZIP_FILE="db_29_1_mysql.zip"
|
||||||
|
ONET_EXTRACT_DIR="db_29_1_mysql"
|
||||||
|
|
||||||
|
# Download O*NET database only if not already downloaded
|
||||||
|
if [ ! -f "$ONET_ZIP_FILE" ]; then
|
||||||
|
echo "Downloading O*NET database from $ONET_ZIP_URL"
|
||||||
|
curl -L -o "$ONET_ZIP_FILE" "$ONET_ZIP_URL" || wget -O "$ONET_ZIP_FILE" "$ONET_ZIP_URL"
|
||||||
|
|
||||||
|
if [ $? -ne 0 ]; then
|
||||||
|
echo "Failed to download O*NET database"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo "Using existing O*NET database zip file"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Extract downloaded zip file only if extraction directory doesn't exist
|
||||||
|
if [ ! -d "$ONET_EXTRACT_DIR" ]; then
|
||||||
|
echo "Extracting O*NET database files"
|
||||||
|
unzip -o "$ONET_ZIP_FILE"
|
||||||
|
|
||||||
|
if [ $? -ne 0 ]; then
|
||||||
|
echo "Failed to extract O*NET database files"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo "Using existing extracted O*NET database files"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Remove existing database if it exists
|
||||||
|
if [ -f "$ONET_DB_NAME" ]; then
|
||||||
|
echo "Removing existing database"
|
||||||
|
rm "$ONET_DB_NAME"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Create a new SQLite database with optimized settings for fast import
|
||||||
|
echo "Creating new SQLite database: $ONET_DB_NAME with performance settings"
|
||||||
|
sqlite3 "$ONET_DB_NAME" << EOF
|
||||||
|
PRAGMA journal_mode = OFF;
|
||||||
|
PRAGMA synchronous = 0;
|
||||||
|
PRAGMA cache_size = 1000000;
|
||||||
|
PRAGMA locking_mode = EXCLUSIVE;
|
||||||
|
PRAGMA temp_store = MEMORY;
|
||||||
|
PRAGMA foreign_keys = ON;
|
||||||
|
EOF
|
||||||
|
|
||||||
|
# Combine and execute all SQL files in one transaction
|
||||||
|
echo "Executing SQL files in alphabetical order (single transaction mode)"
|
||||||
|
sqlite3 "$ONET_DB_NAME" << EOF
|
||||||
|
BEGIN TRANSACTION;
|
||||||
|
$(find "$ONET_EXTRACT_DIR" -name "*.sql" | sort | xargs cat)
|
||||||
|
COMMIT;
|
||||||
|
EOF
|
||||||
|
|
||||||
|
# Check if the execution was successful
|
||||||
|
if [ $? -ne 0 ]; then
|
||||||
|
echo "Error executing SQL files in batch transaction"
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
echo "Database populated successfully. Restoring reliability settings..."
|
||||||
|
|
||||||
|
# Restore reliability-focused settings after import
|
||||||
|
sqlite3 "$ONET_DB_NAME" << EOF
|
||||||
|
PRAGMA journal_mode = WAL;
|
||||||
|
PRAGMA synchronous = NORMAL;
|
||||||
|
PRAGMA locking_mode = NORMAL;
|
||||||
|
PRAGMA temp_store = DEFAULT;
|
||||||
|
PRAGMA foreign_keys = ON;
|
||||||
|
PRAGMA optimize;
|
||||||
|
VACUUM;
|
||||||
|
EOF
|
||||||
|
|
||||||
|
if [ $? -ne 0 ]; then
|
||||||
|
echo "Warning: Failed to restore reliability settings, but database is populated"
|
||||||
|
else
|
||||||
|
echo "Reliability settings restored successfully"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "O*NET database created and optimized successfully!"
|
||||||
|
fi
|
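Usage note: the notebook assumes this script has already been run ("Run ./create_onet_database.sh to create it"). A small, assumed sanity check after running it — task_statements is one of the tables the enrichment script below queries:

# Assumed post-import check, not part of the commit.
import sqlite3

conn = sqlite3.connect("onet.database")
count = conn.execute("SELECT COUNT(*) FROM task_statements").fetchone()[0]
print(f"task_statements rows: {count}")
conn.close()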
enrich_task_ratings.py (new file, 223 lines)
@@ -0,0 +1,223 @@
import sqlite3
|
||||||
|
import pandas as pd
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from collections import defaultdict
|
||||||
|
import numpy as np # Import numpy for nan handling if necessary
|
||||||
|
|
||||||
|
# --- Configuration ---
|
||||||
|
DB_FILE = "onet.database"
|
||||||
|
OUTPUT_FILE = "task_ratings_enriched.json"
|
||||||
|
|
||||||
|
# --- Database Interaction ---
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_data_from_db(db_path):
|
||||||
|
"""
|
||||||
|
Fetches required data from the O*NET SQLite database using JOINs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
db_path (str): Path to the SQLite database file.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
pandas.DataFrame: DataFrame containing joined data from task_ratings,
|
||||||
|
task_statements, and occupation_data.
|
||||||
|
Returns None if the database file doesn't exist or an error occurs.
|
||||||
|
"""
|
||||||
|
if not os.path.exists(db_path):
|
||||||
|
print(f"Error: Database file not found at {db_path}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
conn = sqlite3.connect(db_path)
|
||||||
|
# Construct the SQL query to join the tables and select necessary columns
|
||||||
|
# We select all relevant columns needed for processing.
|
||||||
|
query = """
|
||||||
|
SELECT
|
||||||
|
tr.onetsoc_code,
|
||||||
|
tr.task_id,
|
||||||
|
ts.task,
|
||||||
|
od.title AS occupation_title,
|
||||||
|
od.description AS occupation_description,
|
||||||
|
tr.scale_id,
|
||||||
|
tr.category,
|
||||||
|
tr.data_value
|
||||||
|
FROM
|
||||||
|
task_ratings tr
|
||||||
|
JOIN
|
||||||
|
task_statements ts ON tr.task_id = ts.task_id
|
||||||
|
JOIN
|
||||||
|
occupation_data od ON tr.onetsoc_code = od.onetsoc_code;
|
||||||
|
"""
|
||||||
|
df = pd.read_sql_query(query, conn)
|
||||||
|
conn.close()
|
||||||
|
print(f"Successfully fetched {len(df)} records from the database.")
|
||||||
|
return df
|
||||||
|
except sqlite3.Error as e:
|
||||||
|
print(f"SQLite error: {e}")
|
||||||
|
if "conn" in locals() and conn:
|
||||||
|
conn.close()
|
||||||
|
return None
|
||||||
|
except Exception as e:
|
||||||
|
print(f"An error occurred during data fetching: {e}")
|
||||||
|
if "conn" in locals() and conn:
|
||||||
|
conn.close()
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# --- Data Processing ---
|
||||||
|
|
||||||
|
|
||||||
|
def process_task_ratings(df):
|
||||||
|
"""
|
||||||
|
Processes the fetched data to group, pivot frequency, calculate averages,
|
||||||
|
and structure the output.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
df (pandas.DataFrame): The input DataFrame with joined data.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
list: A list of dictionaries, each representing an enriched task rating.
|
||||||
|
Returns None if the input DataFrame is invalid.
|
||||||
|
"""
|
||||||
|
if df is None or df.empty:
|
||||||
|
print("Error: Input DataFrame is empty or invalid.")
|
||||||
|
return None
|
||||||
|
|
||||||
|
print("Starting data processing...")
|
||||||
|
|
||||||
|
# --- 1. Handle Frequency (FT) ---
|
||||||
|
# Filter for Frequency ratings
|
||||||
|
freq_df = df[df["scale_id"] == "FT"].copy()
|
||||||
|
# Pivot the frequency data: index by task and occupation, columns by category
|
||||||
|
# We fill missing frequency values with 0, assuming no rating means 0% for that category.
|
||||||
|
freq_pivot = freq_df.pivot_table(
|
||||||
|
index=["onetsoc_code", "task_id"],
|
||||||
|
columns="category",
|
||||||
|
values="data_value",
|
||||||
|
fill_value=0, # Fill missing categories for a task/occupation with 0
|
||||||
|
)
|
||||||
|
# Rename columns for clarity using the requested format
|
||||||
|
freq_pivot.columns = [
|
||||||
|
f"frequency_category_{int(col)}" for col in freq_pivot.columns
|
||||||
|
] # <-- UPDATED LINE
|
||||||
|
print(f"Processed Frequency data. Shape: {freq_pivot.shape}")
|
||||||
|
|
||||||
|
# --- 2. Handle Importance (IM, IJ) ---
|
||||||
|
# Filter for Importance ratings
|
||||||
|
imp_df = df[df["scale_id"].isin(["IM", "IJ"])].copy()
|
||||||
|
# Group by task and occupation, calculate the mean importance
|
||||||
|
# Using np.nanmean to handle potential NaN values gracefully if any exist
|
||||||
|
imp_avg = (
|
||||||
|
imp_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
|
||||||
|
)
|
||||||
|
imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True)
|
||||||
|
print(f"Processed Importance data. Shape: {imp_avg.shape}")
|
||||||
|
|
||||||
|
# --- 3. Handle Relevance (RT) ---
|
||||||
|
# Filter for Relevance ratings
|
||||||
|
rel_df = df[df["scale_id"] == "RT"].copy()
|
||||||
|
# Group by task and occupation, calculate the mean relevance
|
||||||
|
rel_avg = (
|
||||||
|
rel_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
|
||||||
|
)
|
||||||
|
rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True)
|
||||||
|
print(f"Processed Relevance data. Shape: {rel_avg.shape}")
|
||||||
|
|
||||||
|
# --- 4. Get Base Task/Occupation Info ---
|
||||||
|
# Select unique combinations of task and occupation details
|
||||||
|
base_info = (
|
||||||
|
df[
|
||||||
|
[
|
||||||
|
"onetsoc_code",
|
||||||
|
"task_id",
|
||||||
|
"task",
|
||||||
|
"occupation_title",
|
||||||
|
"occupation_description",
|
||||||
|
]
|
||||||
|
]
|
||||||
|
.drop_duplicates()
|
||||||
|
.set_index(["onetsoc_code", "task_id"])
|
||||||
|
)
|
||||||
|
print(f"Extracted base info. Shape: {base_info.shape}")
|
||||||
|
|
||||||
|
# --- 5. Merge Processed Data ---
|
||||||
|
# Start with the base info and merge the calculated/pivoted data
|
||||||
|
# Use 'left' joins to ensure all tasks/occupations from the base_info are kept.
|
||||||
|
# If a task/occupation doesn't have frequency, importance, or relevance ratings,
|
||||||
|
# the corresponding columns will have NaN values after the merge.
|
||||||
|
print("Merging processed data...")
|
||||||
|
final_df = base_info.merge(
|
||||||
|
freq_pivot, left_index=True, right_index=True, how="left"
|
||||||
|
)
|
||||||
|
# Set index before merging averages which are not multi-indexed
|
||||||
|
final_df = final_df.reset_index()
|
||||||
|
final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left")
|
||||||
|
final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left")
|
||||||
|
|
||||||
|
# Fill potential NaN values resulting from left joins if needed.
|
||||||
|
# For averages, NaN might mean no rating was provided. We can leave them as NaN
|
||||||
|
# or fill with 0 or another placeholder depending on desired interpretation.
|
||||||
|
# For frequency categories, NaN could mean that category wasn't rated. We filled with 0 during pivot.
|
||||||
|
# Example: Fill NaN averages with 0
|
||||||
|
# final_df['importance_average'].fillna(0, inplace=True)
|
||||||
|
# final_df['relevance_average'].fillna(0, inplace=True)
|
||||||
|
# Note: Leaving NaNs might be more informative.
|
||||||
|
|
||||||
|
print(f"Final merged data shape: {final_df.shape}")
|
||||||
|
|
||||||
|
# Convert DataFrame to list of dictionaries for JSON output
|
||||||
|
# Handle potential NaN values during JSON conversion
|
||||||
|
final_df = final_df.replace(
|
||||||
|
{np.nan: None}
|
||||||
|
) # Replace numpy NaN with Python None for JSON compatibility
|
||||||
|
result_list = final_df.to_dict(orient="records")
|
||||||
|
|
||||||
|
return result_list
|
||||||
|
|
||||||
|
|
||||||
|
# --- Output ---
|
||||||
|
|
||||||
|
|
||||||
|
def write_to_json(data, output_path):
|
||||||
|
"""
|
||||||
|
Writes the processed data to a JSON file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
data (list): The list of dictionaries to write.
|
||||||
|
output_path (str): Path to the output JSON file.
|
||||||
|
"""
|
||||||
|
if data is None:
|
||||||
|
print("No data to write to JSON.")
|
||||||
|
return
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open(output_path, "w", encoding="utf-8") as f:
|
||||||
|
json.dump(data, f, indent=4, ensure_ascii=False)
|
||||||
|
print(f"Successfully wrote enriched data to {output_path}")
|
||||||
|
except IOError as e:
|
||||||
|
print(f"Error writing JSON file: {e}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"An unexpected error occurred during JSON writing: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
# --- Main Execution ---
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
print("Starting O*NET Task Ratings Enrichment Script...")
|
||||||
|
# 1. Fetch data
|
||||||
|
raw_data_df = fetch_data_from_db(DB_FILE)
|
||||||
|
|
||||||
|
# 2. Process data
|
||||||
|
if raw_data_df is not None:
|
||||||
|
enriched_data = process_task_ratings(raw_data_df)
|
||||||
|
|
||||||
|
# 3. Write output
|
||||||
|
if enriched_data:
|
||||||
|
write_to_json(enriched_data, OUTPUT_FILE)
|
||||||
|
else:
|
||||||
|
print("Data processing failed. No output file generated.")
|
||||||
|
else:
|
||||||
|
print("Data fetching failed. Script terminated.")
|
||||||
|
|
||||||
|
print("Script finished.")
|
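The notebook reads this script's output with pd.read_json("task_ratings_enriched.json"). A brief, assumed usage sketch: run the script with "uv run enrich_task_ratings.py", then inspect the result as below; the column list in the comment reflects what the script writes.

# Assumed inspection of the enriched output, not part of the commit.
import pandas as pd

df = pd.read_json("task_ratings_enriched.json")
print(df.shape)
print(df.columns.tolist())  # onetsoc_code, task_id, task, occupation_title, occupation_description,
                            # frequency_category_1..7, importance_average, relevance_average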
flake.lock (generated, new file, 99 lines)
@@ -0,0 +1,99 @@
{
|
||||||
|
"nodes": {
|
||||||
|
"nixpkgs": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1745526057,
|
||||||
|
"narHash": "sha256-ITSpPDwvLBZBnPRS2bUcHY3gZSwis/uTe255QgMtTLA=",
|
||||||
|
"owner": "NixOS",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"rev": "f771eb401a46846c1aebd20552521b233dd7e18b",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "NixOS",
|
||||||
|
"ref": "nixos-unstable",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"pyproject-build-systems": {
|
||||||
|
"inputs": {
|
||||||
|
"nixpkgs": [
|
||||||
|
"nixpkgs"
|
||||||
|
],
|
||||||
|
"pyproject-nix": [
|
||||||
|
"pyproject-nix"
|
||||||
|
],
|
||||||
|
"uv2nix": [
|
||||||
|
"uv2nix"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1744599653,
|
||||||
|
"narHash": "sha256-nysSwVVjG4hKoOjhjvE6U5lIKA8sEr1d1QzEfZsannU=",
|
||||||
|
"owner": "pyproject-nix",
|
||||||
|
"repo": "build-system-pkgs",
|
||||||
|
"rev": "7dba6dbc73120e15b558754c26024f6c93015dd7",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "pyproject-nix",
|
||||||
|
"repo": "build-system-pkgs",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"pyproject-nix": {
|
||||||
|
"inputs": {
|
||||||
|
"nixpkgs": [
|
||||||
|
"nixpkgs"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1743438845,
|
||||||
|
"narHash": "sha256-1GSaoubGtvsLRwoYwHjeKYq40tLwvuFFVhGrG8J9Oek=",
|
||||||
|
"owner": "pyproject-nix",
|
||||||
|
"repo": "pyproject.nix",
|
||||||
|
"rev": "8063ec98edc459571d042a640b1c5e334ecfca1e",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "pyproject-nix",
|
||||||
|
"repo": "pyproject.nix",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": {
|
||||||
|
"inputs": {
|
||||||
|
"nixpkgs": "nixpkgs",
|
||||||
|
"pyproject-build-systems": "pyproject-build-systems",
|
||||||
|
"pyproject-nix": "pyproject-nix",
|
||||||
|
"uv2nix": "uv2nix"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"uv2nix": {
|
||||||
|
"inputs": {
|
||||||
|
"nixpkgs": [
|
||||||
|
"nixpkgs"
|
||||||
|
],
|
||||||
|
"pyproject-nix": [
|
||||||
|
"pyproject-nix"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1745328266,
|
||||||
|
"narHash": "sha256-ykgcOadiU9Z67P2MOjB0r06r35cQu65t0fzDeYR1uzc=",
|
||||||
|
"owner": "pyproject-nix",
|
||||||
|
"repo": "uv2nix",
|
||||||
|
"rev": "bcadc56a1e90d89bf32cc4ac308d8252e2adf855",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "pyproject-nix",
|
||||||
|
"repo": "uv2nix",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": "root",
|
||||||
|
"version": 7
|
||||||
|
}
|
pyproject.toml (new file, 31 lines)
@@ -0,0 +1,31 @@
[project]
name = "sprint-econtai"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "dotenv>=0.9.9",
    "jupyter>=1.1.1",
    "notebook>=7.4.1",
    "openai>=1.76.0",
    "openpyxl>=3.1.5",
    "pandas>=2.2.3",
    "requests>=2.32.3",
    "tqdm>=4.67.1",
]


[tool.pytest.ini_options]
pythonpath="src"
addopts="-v"
asyncio_mode = "auto"

[tool.black]
line-length = 100

[tool.isort]
profile = "black"

[dependency-groups]
dev = []