sprint-econtai/archive/legacy.ipynb
Félix Dorn 43076bcbb1 old
2025-07-15 00:41:05 +02:00

628 lines
24 KiB
Text
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 86,
"id": "beace815-b5ae-44a4-a81c-a7f82cb66296",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2K\u001b[2mResolved \u001b[1m118 packages\u001b[0m \u001b[2min 386ms\u001b[0m\u001b[0m \u001b[0m\n",
"\u001b[2K\u001b[2mPrepared \u001b[1m2 packages\u001b[0m \u001b[2min 124ms\u001b[0m\u001b[0m \n",
"\u001b[2K\u001b[2mInstalled \u001b[1m2 packages\u001b[0m \u001b[2min 5ms\u001b[0m\u001b[0m \u001b[0m\n",
" \u001b[32m+\u001b[39m \u001b[1met-xmlfile\u001b[0m\u001b[2m==2.0.0\u001b[0m\n",
" \u001b[32m+\u001b[39m \u001b[1mopenpyxl\u001b[0m\u001b[2m==3.1.5\u001b[0m\n"
]
}
],
"source": [
"!uv add pandas requests openai dotenv openpyxl"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "941d511f-ad72-4306-bbab-52127583e513",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import dotenv\n",
"import openai\n",
"import sqlite3\n",
"import pandas as pd\n",
"\n",
"dotenv.load_dotenv() # Copy .env.example to .env and fill in the blanks\n",
"\n",
"oai_token = os.getenv(\"OPENAI_API_KEY\")\n",
"\n",
"oai = openai.OpenAI(api_key=oai_token)\n",
"onet = sqlite3.connect(\"onet.database\") # Run ./create_onet_database.sh to create it\n",
"# This dataset comes from https://epoch.ai/gradient-updates/consequences-of-automating-remote-work\n",
"# It contains labels for whethere a O*NET task can be done remotely or not (labeled by GPT-4o)\n",
"# You can download it here: https://drive.google.com/file/d/1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r/view?usp=sharing\n",
"df_remote_status = pd.read_csv(\"epoch_task_data.csv\")\n",
"\n",
"# BLS OEWS: https://www.bls.gov/oes/special-requests/oesm23nat.zip\n",
"df_oesm = pd.read_excel(\"oesm23national.xlsx\")\n",
"\n",
"# Run uv run enrich_task_ratings.py to get this file (trs = Task RatingS)\n",
"df_enriched_trs = pd.read_json(\"task_ratings_enriched.json\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "a5351f8b-c2ad-4d3e-af4a-992f539a6064",
"metadata": {},
"outputs": [],
"source": [
"FREQUENCY_MAP = {\n",
" 'frequency_category_1': \"Yearly or less\",\n",
" 'frequency_category_2': \"More than yearly\",\n",
" 'frequency_category_3': \"More than monthly\",\n",
" 'frequency_category_4': \"More than weekly\",\n",
" 'frequency_category_5': \"Daily\",\n",
" 'frequency_category_6': \"Several times daily\",\n",
" 'frequency_category_7': \"Hourly or more\"\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "8b2ab22a-afab-41f9-81a3-48eab261b568",
"metadata": {},
"outputs": [],
"source": [
"background_prompt = '''\n",
"Estimate the typical duration to complete *one instance* of the following job task from the moment a person starts to work on it to the last moment the person will need to keep it in mind\n",
"\n",
"Take into account that there might be delays between the steps to complete the task, which would lengthen the estimate.\n",
"\n",
"Output a range with the format [duration A] - [duration B] where [duration A] and [duration B] correspond to one of the durations below:\n",
"- less than 30 minutes\n",
"- 30 minutes\n",
"- 1 hour\n",
"- 4 hours\n",
"- 8 hours\n",
"- 16 hours\n",
"- 3 days\n",
"- 1 week\n",
"- 3 weeks\n",
"- 6 weeks\n",
"- 3 months\n",
"- 6 months\n",
"- 1 year\n",
"- 3 years\n",
"- more than 3 year\n",
"\n",
"**Do not output anything besides the range**\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "d2e4a855-f327-4b3d-ad0b-ed997e720639",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>onetsoc_code</th>\n",
" <th>task_id</th>\n",
" <th>task</th>\n",
" <th>occupation_title</th>\n",
" <th>occupation_description</th>\n",
" <th>frequency_category_1</th>\n",
" <th>frequency_category_2</th>\n",
" <th>frequency_category_3</th>\n",
" <th>frequency_category_4</th>\n",
" <th>frequency_category_5</th>\n",
" <th>frequency_category_6</th>\n",
" <th>frequency_category_7</th>\n",
" <th>importance_average</th>\n",
" <th>relevance_average</th>\n",
" <th>OCC_CODE</th>\n",
" <th>TOT_EMP</th>\n",
" <th>H_MEAN</th>\n",
" <th>A_MEAN</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>11-1011.00</td>\n",
" <td>8823</td>\n",
" <td>Direct or coordinate an organization's financi...</td>\n",
" <td>Chief Executives</td>\n",
" <td>Determine and formulate policies and provide o...</td>\n",
" <td>5.92</td>\n",
" <td>15.98</td>\n",
" <td>29.68</td>\n",
" <td>21.18</td>\n",
" <td>19.71</td>\n",
" <td>4.91</td>\n",
" <td>2.63</td>\n",
" <td>4.52</td>\n",
" <td>74.44</td>\n",
" <td>11-1011</td>\n",
" <td>211230.0</td>\n",
" <td>124.47</td>\n",
" <td>258900</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>11-1011.00</td>\n",
" <td>8824</td>\n",
" <td>Confer with board members, organization offici...</td>\n",
" <td>Chief Executives</td>\n",
" <td>Determine and formulate policies and provide o...</td>\n",
" <td>1.42</td>\n",
" <td>14.44</td>\n",
" <td>27.31</td>\n",
" <td>25.52</td>\n",
" <td>26.88</td>\n",
" <td>2.52</td>\n",
" <td>1.90</td>\n",
" <td>4.32</td>\n",
" <td>81.71</td>\n",
" <td>11-1011</td>\n",
" <td>211230.0</td>\n",
" <td>124.47</td>\n",
" <td>258900</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>11-1011.00</td>\n",
" <td>8827</td>\n",
" <td>Prepare budgets for approval, including those ...</td>\n",
" <td>Chief Executives</td>\n",
" <td>Determine and formulate policies and provide o...</td>\n",
" <td>15.50</td>\n",
" <td>38.21</td>\n",
" <td>32.73</td>\n",
" <td>5.15</td>\n",
" <td>5.25</td>\n",
" <td>0.19</td>\n",
" <td>2.98</td>\n",
" <td>4.30</td>\n",
" <td>93.41</td>\n",
" <td>11-1011</td>\n",
" <td>211230.0</td>\n",
" <td>124.47</td>\n",
" <td>258900</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>11-1011.00</td>\n",
" <td>8826</td>\n",
" <td>Direct, plan, or implement policies, objective...</td>\n",
" <td>Chief Executives</td>\n",
" <td>Determine and formulate policies and provide o...</td>\n",
" <td>3.03</td>\n",
" <td>17.33</td>\n",
" <td>20.30</td>\n",
" <td>18.10</td>\n",
" <td>33.16</td>\n",
" <td>2.01</td>\n",
" <td>6.07</td>\n",
" <td>4.24</td>\n",
" <td>97.79</td>\n",
" <td>11-1011</td>\n",
" <td>211230.0</td>\n",
" <td>124.47</td>\n",
" <td>258900</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>11-1011.00</td>\n",
" <td>8834</td>\n",
" <td>Prepare or present reports concerning activiti...</td>\n",
" <td>Chief Executives</td>\n",
" <td>Determine and formulate policies and provide o...</td>\n",
" <td>1.98</td>\n",
" <td>14.06</td>\n",
" <td>42.60</td>\n",
" <td>21.24</td>\n",
" <td>13.18</td>\n",
" <td>6.24</td>\n",
" <td>0.70</td>\n",
" <td>4.17</td>\n",
" <td>92.92</td>\n",
" <td>11-1011</td>\n",
" <td>211230.0</td>\n",
" <td>124.47</td>\n",
" <td>258900</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17634</th>\n",
" <td>53-7121.00</td>\n",
" <td>12807</td>\n",
" <td>Unload cars containing liquids by connecting h...</td>\n",
" <td>Tank Car, Truck, and Ship Loaders</td>\n",
" <td>Load and unload chemicals and bulk solids, suc...</td>\n",
" <td>6.05</td>\n",
" <td>29.21</td>\n",
" <td>6.88</td>\n",
" <td>13.95</td>\n",
" <td>27.65</td>\n",
" <td>7.93</td>\n",
" <td>8.34</td>\n",
" <td>4.08</td>\n",
" <td>64.04</td>\n",
" <td>53-7121</td>\n",
" <td>11400.0</td>\n",
" <td>29.1</td>\n",
" <td>60530</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17635</th>\n",
" <td>53-7121.00</td>\n",
" <td>12804</td>\n",
" <td>Clean interiors of tank cars or tank trucks, u...</td>\n",
" <td>Tank Car, Truck, and Ship Loaders</td>\n",
" <td>Load and unload chemicals and bulk solids, suc...</td>\n",
" <td>1.47</td>\n",
" <td>6.33</td>\n",
" <td>21.70</td>\n",
" <td>25.69</td>\n",
" <td>32.35</td>\n",
" <td>12.47</td>\n",
" <td>0.00</td>\n",
" <td>4.02</td>\n",
" <td>44.33</td>\n",
" <td>53-7121</td>\n",
" <td>11400.0</td>\n",
" <td>29.1</td>\n",
" <td>60530</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17636</th>\n",
" <td>53-7121.00</td>\n",
" <td>12803</td>\n",
" <td>Lower gauge rods into tanks or read meters to ...</td>\n",
" <td>Tank Car, Truck, and Ship Loaders</td>\n",
" <td>Load and unload chemicals and bulk solids, suc...</td>\n",
" <td>4.52</td>\n",
" <td>1.76</td>\n",
" <td>4.65</td>\n",
" <td>17.81</td>\n",
" <td>37.42</td>\n",
" <td>23.31</td>\n",
" <td>10.55</td>\n",
" <td>3.88</td>\n",
" <td>65.00</td>\n",
" <td>53-7121</td>\n",
" <td>11400.0</td>\n",
" <td>29.1</td>\n",
" <td>60530</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17637</th>\n",
" <td>53-7121.00</td>\n",
" <td>12805</td>\n",
" <td>Operate conveyors and equipment to transfer gr...</td>\n",
" <td>Tank Car, Truck, and Ship Loaders</td>\n",
" <td>Load and unload chemicals and bulk solids, suc...</td>\n",
" <td>6.97</td>\n",
" <td>12.00</td>\n",
" <td>2.52</td>\n",
" <td>5.90</td>\n",
" <td>35.48</td>\n",
" <td>22.08</td>\n",
" <td>15.05</td>\n",
" <td>3.87</td>\n",
" <td>47.90</td>\n",
" <td>53-7121</td>\n",
" <td>11400.0</td>\n",
" <td>29.1</td>\n",
" <td>60530</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17638</th>\n",
" <td>53-7121.00</td>\n",
" <td>12810</td>\n",
" <td>Perform general warehouse activities, such as ...</td>\n",
" <td>Tank Car, Truck, and Ship Loaders</td>\n",
" <td>Load and unload chemicals and bulk solids, suc...</td>\n",
" <td>5.91</td>\n",
" <td>10.85</td>\n",
" <td>6.46</td>\n",
" <td>14.46</td>\n",
" <td>34.14</td>\n",
" <td>16.39</td>\n",
" <td>11.78</td>\n",
" <td>3.53</td>\n",
" <td>47.84</td>\n",
" <td>53-7121</td>\n",
" <td>11400.0</td>\n",
" <td>29.1</td>\n",
" <td>60530</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>17639 rows × 18 columns</p>\n",
"</div>"
],
"text/plain": [
" onetsoc_code task_id \\\n",
"0 11-1011.00 8823 \n",
"1 11-1011.00 8824 \n",
"2 11-1011.00 8827 \n",
"3 11-1011.00 8826 \n",
"4 11-1011.00 8834 \n",
"... ... ... \n",
"17634 53-7121.00 12807 \n",
"17635 53-7121.00 12804 \n",
"17636 53-7121.00 12803 \n",
"17637 53-7121.00 12805 \n",
"17638 53-7121.00 12810 \n",
"\n",
" task \\\n",
"0 Direct or coordinate an organization's financi... \n",
"1 Confer with board members, organization offici... \n",
"2 Prepare budgets for approval, including those ... \n",
"3 Direct, plan, or implement policies, objective... \n",
"4 Prepare or present reports concerning activiti... \n",
"... ... \n",
"17634 Unload cars containing liquids by connecting h... \n",
"17635 Clean interiors of tank cars or tank trucks, u... \n",
"17636 Lower gauge rods into tanks or read meters to ... \n",
"17637 Operate conveyors and equipment to transfer gr... \n",
"17638 Perform general warehouse activities, such as ... \n",
"\n",
" occupation_title \\\n",
"0 Chief Executives \n",
"1 Chief Executives \n",
"2 Chief Executives \n",
"3 Chief Executives \n",
"4 Chief Executives \n",
"... ... \n",
"17634 Tank Car, Truck, and Ship Loaders \n",
"17635 Tank Car, Truck, and Ship Loaders \n",
"17636 Tank Car, Truck, and Ship Loaders \n",
"17637 Tank Car, Truck, and Ship Loaders \n",
"17638 Tank Car, Truck, and Ship Loaders \n",
"\n",
" occupation_description \\\n",
"0 Determine and formulate policies and provide o... \n",
"1 Determine and formulate policies and provide o... \n",
"2 Determine and formulate policies and provide o... \n",
"3 Determine and formulate policies and provide o... \n",
"4 Determine and formulate policies and provide o... \n",
"... ... \n",
"17634 Load and unload chemicals and bulk solids, suc... \n",
"17635 Load and unload chemicals and bulk solids, suc... \n",
"17636 Load and unload chemicals and bulk solids, suc... \n",
"17637 Load and unload chemicals and bulk solids, suc... \n",
"17638 Load and unload chemicals and bulk solids, suc... \n",
"\n",
" frequency_category_1 frequency_category_2 frequency_category_3 \\\n",
"0 5.92 15.98 29.68 \n",
"1 1.42 14.44 27.31 \n",
"2 15.50 38.21 32.73 \n",
"3 3.03 17.33 20.30 \n",
"4 1.98 14.06 42.60 \n",
"... ... ... ... \n",
"17634 6.05 29.21 6.88 \n",
"17635 1.47 6.33 21.70 \n",
"17636 4.52 1.76 4.65 \n",
"17637 6.97 12.00 2.52 \n",
"17638 5.91 10.85 6.46 \n",
"\n",
" frequency_category_4 frequency_category_5 frequency_category_6 \\\n",
"0 21.18 19.71 4.91 \n",
"1 25.52 26.88 2.52 \n",
"2 5.15 5.25 0.19 \n",
"3 18.10 33.16 2.01 \n",
"4 21.24 13.18 6.24 \n",
"... ... ... ... \n",
"17634 13.95 27.65 7.93 \n",
"17635 25.69 32.35 12.47 \n",
"17636 17.81 37.42 23.31 \n",
"17637 5.90 35.48 22.08 \n",
"17638 14.46 34.14 16.39 \n",
"\n",
" frequency_category_7 importance_average relevance_average OCC_CODE \\\n",
"0 2.63 4.52 74.44 11-1011 \n",
"1 1.90 4.32 81.71 11-1011 \n",
"2 2.98 4.30 93.41 11-1011 \n",
"3 6.07 4.24 97.79 11-1011 \n",
"4 0.70 4.17 92.92 11-1011 \n",
"... ... ... ... ... \n",
"17634 8.34 4.08 64.04 53-7121 \n",
"17635 0.00 4.02 44.33 53-7121 \n",
"17636 10.55 3.88 65.00 53-7121 \n",
"17637 15.05 3.87 47.90 53-7121 \n",
"17638 11.78 3.53 47.84 53-7121 \n",
"\n",
" TOT_EMP H_MEAN A_MEAN \n",
"0 211230.0 124.47 258900 \n",
"1 211230.0 124.47 258900 \n",
"2 211230.0 124.47 258900 \n",
"3 211230.0 124.47 258900 \n",
"4 211230.0 124.47 258900 \n",
"... ... ... ... \n",
"17634 11400.0 29.1 60530 \n",
"17635 11400.0 29.1 60530 \n",
"17636 11400.0 29.1 60530 \n",
"17637 11400.0 29.1 60530 \n",
"17638 11400.0 29.1 60530 \n",
"\n",
"[17639 rows x 18 columns]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_oesm_detailed = df_oesm[df_oesm['O_GROUP'] == 'detailed'][['OCC_CODE', 'TOT_EMP', 'H_MEAN', 'A_MEAN']].copy()\n",
"df_enriched_trs['occ_code_join'] = df_enriched_trs['onetsoc_code'].str[:7]\n",
"df = pd.merge(\n",
" df_enriched_trs,\n",
" df_oesm_detailed,\n",
" left_on='occ_code_join',\n",
" right_on='OCC_CODE',\n",
" how='left'\n",
")\n",
"df = df.drop(columns=['occ_code_join'])\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "9be7acb5-2374-4f61-bba3-13b0077c0bd2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Task: Identify, evaluate and recommend hardware or software technologies to achieve desired database performance.\n",
"Occupation Description: Design strategies for enterprise databases, data warehouse systems, and multidimensional networks. Set standards for database operations, programming, query processes, and security. Model, design, and construct large relational databases or data warehouses. Create and optimize data models for warehouse infrastructure and workflow. Integrate new systems with existing warehouse structure and refine system performance and functionality.\n",
"Occupation Title: Database Architects\n"
]
},
{
"data": {
"text/plain": [
"119976"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_merged = pd \\\n",
" .merge(left=df_enriched_trs, right=df_remote_status[['O*NET-SOC Code', 'Remote']], how='left', left_on='onetsoc_code', right_on='O*NET-SOC Code') \\\n",
" .drop(columns=['O*NET-SOC Code']) \\\n",
" .rename(columns={'Remote': 'remote'}) \\\n",
" .rename(columns=FREQUENCY_MAP) \\\n",
" .query('remote == \"remote\" and importance_average >= 3 and relevance_average > 50')\n",
"\n",
"row = df_merged.iloc[30000]\n",
"print('Task: ', row['task'])\n",
"print('Occupation Description: ', row['occupation_description'])\n",
"print('Occupation Title: ', row['occupation_title'])\n",
"\n",
"len(df_merged)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fd9ac1c3-6d17-4764-8a2e-c84d4019bd9e",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"outputs": [],
"source": [
"# Cross-reference woth BLS OEWS\n",
"# It doesn't really make sens to have it per-task, we only need it per-occupation...\n",
"df_oesm_detailed = df_oesm[df_oesm['O_GROUP'] == 'detailed'][['OCC_CODE', 'TOT_EMP', 'H_MEAN', 'A_MEAN']].copy()\n",
"df_merged['occ_code_join'] = df_merged['onetsoc_code'].str[:7]\n",
"df_merged = pd.merge(\n",
" df_merged,\n",
" df_oesm_detailed,\n",
" left_on='occ_code_join',\n",
" right_on='OCC_CODE',\n",
" how='left'\n",
")\n",
"df_merged = df_merged.drop(columns=['occ_code_join'])\n",
"df_merged"
]
},
{
"cell_type": "code",
"execution_count": 76,
"id": "08f45d91-039d-4ec0-94a2-f305a3312e6a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Why did the scarecrow win an award?\n",
"\n",
"Because he was outstanding in his field!\n"
]
}
],
"source": [
"response = oai.chat.completions.create(messages=[{\"role\": \"user\", \"content\": \"Tell me a joke\"}], model=\"gpt-4.1-2025-04-14\", max_tokens=100, temperature=0.7, n=1, stop=None)\n",
"joke = response.choices[0].message.content.strip()\n",
"print(joke)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}