From 575ccac8b892c2c0896e180400fd52b91ae76e9f Mon Sep 17 00:00:00 2001
From: voigta80 <aiko.voigt@univie.ac.at>
Date: Tue, 29 Oct 2024 09:25:34 +0000
Subject: [PATCH] Adds dask distributed client for PV pot generation, also core
 functions added

---
 analysis/core.py                 |  39 ++++
 analysis/dask-testing.ipynb      | 312 +++++++++++++++++++++++++++++++
 analysis/exercise_20241022.ipynb |   2 +-
 analysis/exercise_20241029.ipynb | 198 ++++++++++++++++++++
 4 files changed, 550 insertions(+), 1 deletion(-)
 create mode 100644 analysis/core.py
 create mode 100644 analysis/dask-testing.ipynb
 create mode 100644 analysis/exercise_20241029.ipynb

diff --git a/analysis/core.py b/analysis/core.py
new file mode 100644
index 0000000..4d531e3
--- /dev/null
+++ b/analysis/core.py
@@ -0,0 +1,39 @@
+import numpy as np
+import xarray as xr
+import time
+
+def windspeed(_ds):
+    return np.sqrt(np.power(_ds["u10"],2)+np.power(_ds["v10"],2))
+
+def windspeed2(a, b):
+    func = lambda x, y: np.sqrt(x**2 + y**2)
+    return xr.apply_ufunc(func, a, b, dask="parallelized")
+
+def pv_pot(_ds):
+
+    sechour=3600 # seconds per hour
+    c1 = 4.3
+    c2 = 0.943
+    c3 = 0.028
+    c4 = -1.528
+
+    # cell temperature
+    T_cell = c1 + c2 * (_ds.t2m - 273.15) + c3 * _ds.ssrd/sechour + c4 * _ds.wspd
+
+    # performance ratio
+    beta = -0.005
+    p_r = 1 + beta*(T_cell-25)
+
+    # pv potential
+    pv_pot = p_r * _ds.ssrd/(sechour) * 1/1000
+
+    return pv_pot
+
+
+def measure_performance(code_to_run):
+    start_time = time.time()
+    # Run the code
+    code_to_run()
+    end_time = time.time()
+    execution_time = end_time - start_time
+    print(f"Execution time: {execution_time:.5f} seconds")
\ No newline at end of file
diff --git a/analysis/dask-testing.ipynb b/analysis/dask-testing.ipynb
new file mode 100644
index 0000000..c84aaa7
--- /dev/null
+++ b/analysis/dask-testing.ipynb
@@ -0,0 +1,312 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "6b0f6097-a216-43cf-be0b-2e23a325510d",
+   "metadata": {},
+   "source": [
+    "# Using dask distributed cluster to speed up computation of PV potential by factor of 10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "9355f147-7603-413d-8989-69201a37653f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import xarray as xr\n",
+    "import numpy as np\n",
+    "import time\n",
+    "from dask.distributed import Client\n",
+    "import core as core\n",
+    "\n",
+    "import warnings\n",
+    "warnings.filterwarnings(\"ignore\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "56840920-bb48-440f-b983-f9a71471b57f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# location of era5 data on teachinghub\n",
+    "path=\"/home/voigta80/LEHRE/msc-intro-comp-met-ex-w2024/data/era5/\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a8507960-3056-4721-8ec5-d167c3d3f955",
+   "metadata": {},
+   "source": [
+    "Start dask cluster with 10 processes (workers), each with 5 threads. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "61415c08-e2ed-4649-9371-0501c4cea567",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = Client(n_workers=20, threads_per_worker=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "41ee4b25-2070-4ab0-936e-bf94c0d5a8c3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "    <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\"> </div>\n",
+       "    <div style=\"margin-left: 48px;\">\n",
+       "        <h3 style=\"margin-bottom: 0px;\">Client</h3>\n",
+       "        <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Client-2c457eb7-95d7-11ef-9ac9-369f2043f720</p>\n",
+       "        <table style=\"width: 100%; text-align: left;\">\n",
+       "\n",
+       "        <tr>\n",
+       "        \n",
+       "            <td style=\"text-align: left;\"><strong>Connection method:</strong> Cluster object</td>\n",
+       "            <td style=\"text-align: left;\"><strong>Cluster type:</strong> distributed.LocalCluster</td>\n",
+       "        \n",
+       "        </tr>\n",
+       "\n",
+       "        \n",
+       "            <tr>\n",
+       "                <td style=\"text-align: left;\">\n",
+       "                    <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:42853/status\" target=\"_blank\">http://127.0.0.1:42853/status</a>\n",
+       "                </td>\n",
+       "                <td style=\"text-align: left;\"></td>\n",
+       "            </tr>\n",
+       "        \n",
+       "\n",
+       "        </table>\n",
+       "\n",
+       "        \n",
+       "            <details>\n",
+       "            <summary style=\"margin-bottom: 20px;\"><h3 style=\"display: inline;\">Cluster Info</h3></summary>\n",
+       "            <div class=\"jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-output\">\n",
+       "    <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\">\n",
+       "    </div>\n",
+       "    <div style=\"margin-left: 48px;\">\n",
+       "        <h3 style=\"margin-bottom: 0px; margin-top: 0px;\">LocalCluster</h3>\n",
+       "        <p style=\"color: #9D9D9D; margin-bottom: 0px;\">bf1a677b</p>\n",
+       "        <table style=\"width: 100%; text-align: left;\">\n",
+       "            <tr>\n",
+       "                <td style=\"text-align: left;\">\n",
+       "                    <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:42853/status\" target=\"_blank\">http://127.0.0.1:42853/status</a>\n",
+       "                </td>\n",
+       "                <td style=\"text-align: left;\">\n",
+       "                    <strong>Workers:</strong> 1\n",
+       "                </td>\n",
+       "            </tr>\n",
+       "            <tr>\n",
+       "                <td style=\"text-align: left;\">\n",
+       "                    <strong>Total threads:</strong> 5\n",
+       "                </td>\n",
+       "                <td style=\"text-align: left;\">\n",
+       "                    <strong>Total memory:</strong> 753.83 GiB\n",
+       "                </td>\n",
+       "            </tr>\n",
+       "            \n",
+       "            <tr>\n",
+       "    <td style=\"text-align: left;\"><strong>Status:</strong> running</td>\n",
+       "    <td style=\"text-align: left;\"><strong>Using processes:</strong> True</td>\n",
+       "</tr>\n",
+       "\n",
+       "            \n",
+       "        </table>\n",
+       "\n",
+       "        <details>\n",
+       "            <summary style=\"margin-bottom: 20px;\">\n",
+       "                <h3 style=\"display: inline;\">Scheduler Info</h3>\n",
+       "            </summary>\n",
+       "\n",
+       "            <div style=\"\">\n",
+       "    <div>\n",
+       "        <div style=\"width: 24px; height: 24px; background-color: #FFF7E5; border: 3px solid #FF6132; border-radius: 5px; position: absolute;\"> </div>\n",
+       "        <div style=\"margin-left: 48px;\">\n",
+       "            <h3 style=\"margin-bottom: 0px;\">Scheduler</h3>\n",
+       "            <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Scheduler-4dc50ccd-fcd6-4939-be94-079b8855ef99</p>\n",
+       "            <table style=\"width: 100%; text-align: left;\">\n",
+       "                <tr>\n",
+       "                    <td style=\"text-align: left;\">\n",
+       "                        <strong>Comm:</strong> tcp://127.0.0.1:38787\n",
+       "                    </td>\n",
+       "                    <td style=\"text-align: left;\">\n",
+       "                        <strong>Workers:</strong> 1\n",
+       "                    </td>\n",
+       "                </tr>\n",
+       "                <tr>\n",
+       "                    <td style=\"text-align: left;\">\n",
+       "                        <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:42853/status\" target=\"_blank\">http://127.0.0.1:42853/status</a>\n",
+       "                    </td>\n",
+       "                    <td style=\"text-align: left;\">\n",
+       "                        <strong>Total threads:</strong> 5\n",
+       "                    </td>\n",
+       "                </tr>\n",
+       "                <tr>\n",
+       "                    <td style=\"text-align: left;\">\n",
+       "                        <strong>Started:</strong> Just now\n",
+       "                    </td>\n",
+       "                    <td style=\"text-align: left;\">\n",
+       "                        <strong>Total memory:</strong> 753.83 GiB\n",
+       "                    </td>\n",
+       "                </tr>\n",
+       "            </table>\n",
+       "        </div>\n",
+       "    </div>\n",
+       "\n",
+       "    <details style=\"margin-left: 48px;\">\n",
+       "        <summary style=\"margin-bottom: 20px;\">\n",
+       "            <h3 style=\"display: inline;\">Workers</h3>\n",
+       "        </summary>\n",
+       "\n",
+       "        \n",
+       "        <div style=\"margin-bottom: 20px;\">\n",
+       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
+       "            <div style=\"margin-left: 48px;\">\n",
+       "            <details>\n",
+       "                <summary>\n",
+       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 0</h4>\n",
+       "                </summary>\n",
+       "                <table style=\"width: 100%; text-align: left;\">\n",
+       "                    <tr>\n",
+       "                        <td style=\"text-align: left;\">\n",
+       "                            <strong>Comm: </strong> tcp://127.0.0.1:34679\n",
+       "                        </td>\n",
+       "                        <td style=\"text-align: left;\">\n",
+       "                            <strong>Total threads: </strong> 5\n",
+       "                        </td>\n",
+       "                    </tr>\n",
+       "                    <tr>\n",
+       "                        <td style=\"text-align: left;\">\n",
+       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:37085/status\" target=\"_blank\">http://127.0.0.1:37085/status</a>\n",
+       "                        </td>\n",
+       "                        <td style=\"text-align: left;\">\n",
+       "                            <strong>Memory: </strong> 753.83 GiB\n",
+       "                        </td>\n",
+       "                    </tr>\n",
+       "                    <tr>\n",
+       "                        <td style=\"text-align: left;\">\n",
+       "                            <strong>Nanny: </strong> tcp://127.0.0.1:35941\n",
+       "                        </td>\n",
+       "                        <td style=\"text-align: left;\"></td>\n",
+       "                    </tr>\n",
+       "                    <tr>\n",
+       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
+       "                            <strong>Local directory: </strong> /tmp/dask-worker-space/worker-1b2oh5pr\n",
+       "                        </td>\n",
+       "                    </tr>\n",
+       "\n",
+       "                    \n",
+       "\n",
+       "                    \n",
+       "\n",
+       "                </table>\n",
+       "            </details>\n",
+       "            </div>\n",
+       "        </div>\n",
+       "        \n",
+       "\n",
+       "    </details>\n",
+       "</div>\n",
+       "\n",
+       "        </details>\n",
+       "    </div>\n",
+       "</div>\n",
+       "            </details>\n",
+       "        \n",
+       "\n",
+       "    </div>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "<Client: 'tcp://127.0.0.1:38787' processes=1 threads=5, memory=753.83 GiB>"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "client"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7bea8c4c-2b36-4d07-9689-d69c07b1af53",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "start_time = time.time()\n",
+    "ds=xr.open_mfdataset(path+\"era5-2000-*.nc\", engine=\"netcdf4\", chunks={\"valid_time\":1e5} )\n",
+    "ds[\"wspd\"] = core.windspeed(ds)\n",
+    "pvpot = core.pv_pot(ds).groupby(ds.valid_time.dt.month).mean(\"valid_time\").compute()\n",
+    "end_time = time.time()\n",
+    "execution_time = end_time - start_time"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f637f592-f58b-46ad-885a-80670c8ba292",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(f\"Execution time: {execution_time:.5f} seconds\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "70964d66-66a4-44bd-bf54-e64ef6876e67",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client.shutdown()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a375ca41-8af0-46e6-aa25-80241b9ab8f1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pvpot"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "MagicPy",
+   "language": "python",
+   "name": "magicpy"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/analysis/exercise_20241022.ipynb b/analysis/exercise_20241022.ipynb
index 2379132..e497cda 100644
--- a/analysis/exercise_20241022.ipynb
+++ b/analysis/exercise_20241022.ipynb
@@ -84,7 +84,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "sechour=3600 # secondd per hour\n",
+    "sechour=3600 # seconds per hour\n",
     "\n",
     "c1 = 4.3\n",
     "c2 = 0.943\n",
diff --git a/analysis/exercise_20241029.ipynb b/analysis/exercise_20241029.ipynb
new file mode 100644
index 0000000..146e8ad
--- /dev/null
+++ b/analysis/exercise_20241029.ipynb
@@ -0,0 +1,198 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "7d41920d-ccc6-4657-bacc-19006b87d7b8",
+   "metadata": {},
+   "source": [
+    "# Example calculation of PV potential for ERA5 data\n",
+    "\n",
+    "I calculate for one year and derive monthly-mean values of PV potential. I compare two manners of parallelizing the work:\n",
+    "\n",
+    "1. multiprocessing with 1 process per file, hence 12 processes; each process uses dask-xarray under the hood and can hence use >100% CPU\n",
+    "2. dask-xarray processing of 1 dataset with all 12 months"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "99875287-2b84-460f-9c90-2e2542ff4e9b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import xarray as xr\n",
+    "import numpy as np\n",
+    "from pathlib import Path\n",
+    "import matplotlib.pyplot as plt\n",
+    "from multiprocessing import Process, Queue\n",
+    "\n",
+    "import core as core\n",
+    "\n",
+    "import warnings\n",
+    "warnings.filterwarnings(\"ignore\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "1f4067af-2ee7-43f5-b43d-94739e229f5c",
+   "metadata": {
+    "editable": true,
+    "slideshow": {
+     "slide_type": ""
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# location of era5 data on teachinghub\n",
+    "path=\"/home/voigta80/LEHRE/msc-intro-comp-met-ex-w2024/data/era5/\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e447b6c3-da76-4bf2-a417-2d957389ab73",
+   "metadata": {},
+   "source": [
+    "## 1. Multiprocessing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "973862f1-8afc-417c-a009-0864322bb95b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# generate list of era5 files for a given year\n",
+    "def get_filelists(year: str):\n",
+    "    flist = list()\n",
+    "    for file in Path(path).rglob(\"era5-\"+year+\"-*.nc\"):\n",
+    "        flist.append(file)\n",
+    "    return flist"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "83ae56d9-2577-4fee-9f2c-5ef80da798ec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# function to compute time-mean pv potential, will be called by multiprocessing\n",
+    "def batchcompute_pvpot(file, queue):\n",
+    "    ds = xr.open_dataset(file, engine=\"netcdf4\", chunks={\"valid_time\":1e5} )\n",
+    "    ds[\"wspd\"] = core.windspeed(ds)\n",
+    "    pv_pot = core.pv_pot(ds).mean(\"valid_time\").compute()\n",
+    "    queue.put(pv_pot)\n",
+    "    return None"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "8456f917-6cad-4fd7-940b-d083bf15ac78",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nlat = 721;\n",
+    "nlon = 1440;\n",
+    "\n",
+    "def multi_processing():\n",
+    "    year= \"2000\"\n",
+    "    flist = get_filelists(year)\n",
+    "    # use 1 process per monthly file\n",
+    "    nprocs = len(flist)\n",
+    "    # output from each process\n",
+    "    pvpot_chk = np.zeros((nprocs,nlat,nlon))\n",
+    "    queue = Queue()\n",
+    "    processes = [Process(target=batchcompute_pvpot, \n",
+    "                         args=(flist[i], queue)) for i in range(0, nprocs)]\n",
+    "    for process in processes: process.start() # start all processes\n",
+    "    for i in range(nprocs): # collect results from processes\n",
+    "        pvpot_chk[i] = queue.get()\n",
+    "    for process in processes: process.join()  # wait for all processes to complete\n",
+    "    # merge into yearly array\n",
+    "    pvpot = np.stack(pvpot_chk, axis=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "a2b43143-a8d2-4691-a7c9-aea6673cc231",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Execution time: 78.37880 seconds\n"
+     ]
+    }
+   ],
+   "source": [
+    "core.measure_performance(multi_processing)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ee433aec-223e-445c-9040-70e86d1118e1",
+   "metadata": {},
+   "source": [
+    "## 2. Dask-array on merged file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "daf3daff-d39d-4a49-8c13-3ffba150a8b4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def dask_xarray():\n",
+    "    ds2=xr.open_mfdataset(path+\"era5-2000-*.nc\", chunks={\"valid_time\":1e5} )\n",
+    "    ds2[\"wspd\"] = core.windspeed(ds2)\n",
+    "    pvpot2 = core.pv_pot(ds2).groupby(ds2.valid_time.dt.month).mean(\"valid_time\").compute()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "0be1428c-aeca-4678-82df-81666b2df90e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Execution time: 679.63075 seconds\n"
+     ]
+    }
+   ],
+   "source": [
+    "core.measure_performance(dask_xarray)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "MagicPy",
+   "language": "python",
+   "name": "magicpy"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
-- 
GitLab