From 575ccac8b892c2c0896e180400fd52b91ae76e9f Mon Sep 17 00:00:00 2001 From: voigta80 <aiko.voigt@univie.ac.at> Date: Tue, 29 Oct 2024 09:25:34 +0000 Subject: [PATCH] Adds dask distributed client for PV pot generation, also core functions added --- analysis/core.py | 39 ++++ analysis/dask-testing.ipynb | 312 +++++++++++++++++++++++++++++++ analysis/exercise_20241022.ipynb | 2 +- analysis/exercise_20241029.ipynb | 198 ++++++++++++++++++++ 4 files changed, 550 insertions(+), 1 deletion(-) create mode 100644 analysis/core.py create mode 100644 analysis/dask-testing.ipynb create mode 100644 analysis/exercise_20241029.ipynb diff --git a/analysis/core.py b/analysis/core.py new file mode 100644 index 0000000..4d531e3 --- /dev/null +++ b/analysis/core.py @@ -0,0 +1,39 @@ +import numpy as np +import xarray as xr +import time + +def windspeed(_ds): + return np.sqrt(np.power(_ds["u10"],2)+np.power(_ds["v10"],2)) + +def windspeed2(a, b): + func = lambda x, y: np.sqrt(x**2 + y**2) + return xr.apply_ufunc(func, a, b, dask="parallelized") + +def pv_pot(_ds): + + sechour=3600 # seconds per hour + c1 = 4.3 + c2 = 0.943 + c3 = 0.028 + c4 = -1.528 + + # cell temperature + T_cell = c1 + c2 * (_ds.t2m - 273.15) + c3 * _ds.ssrd/sechour + c4 * _ds.wspd + + # performance ratio + beta = -0.005 + p_r = 1 + beta*(T_cell-25) + + # pv potential + pv_pot = p_r * _ds.ssrd/(sechour) * 1/1000 + + return pv_pot + + +def measure_performance(code_to_run): + start_time = time.time() + # Run the code + code_to_run() + end_time = time.time() + execution_time = end_time - start_time + print(f"Execution time: {execution_time:.5f} seconds") \ No newline at end of file diff --git a/analysis/dask-testing.ipynb b/analysis/dask-testing.ipynb new file mode 100644 index 0000000..c84aaa7 --- /dev/null +++ b/analysis/dask-testing.ipynb @@ -0,0 +1,312 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6b0f6097-a216-43cf-be0b-2e23a325510d", + "metadata": {}, + "source": [ + "# Using dask distributed cluster to speed up computation of PV potential by factor of 10" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "9355f147-7603-413d-8989-69201a37653f", + "metadata": {}, + "outputs": [], + "source": [ + "import xarray as xr\n", + "import numpy as np\n", + "import time\n", + "from dask.distributed import Client\n", + "import core as core\n", + "\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "56840920-bb48-440f-b983-f9a71471b57f", + "metadata": {}, + "outputs": [], + "source": [ + "# location of era5 data on teachinghub\n", + "path=\"/home/voigta80/LEHRE/msc-intro-comp-met-ex-w2024/data/era5/\"" + ] + }, + { + "cell_type": "markdown", + "id": "a8507960-3056-4721-8ec5-d167c3d3f955", + "metadata": {}, + "source": [ + "Start dask cluster with 10 processes (workers), each with 5 threads. " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "61415c08-e2ed-4649-9371-0501c4cea567", + "metadata": {}, + "outputs": [], + "source": [ + "client = Client(n_workers=20, threads_per_worker=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "41ee4b25-2070-4ab0-936e-bf94c0d5a8c3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + " <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\"> </div>\n", + " <div style=\"margin-left: 48px;\">\n", + " <h3 style=\"margin-bottom: 0px;\">Client</h3>\n", + " <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Client-2c457eb7-95d7-11ef-9ac9-369f2043f720</p>\n", + " <table style=\"width: 100%; text-align: left;\">\n", + "\n", + " <tr>\n", + " \n", + " <td style=\"text-align: left;\"><strong>Connection method:</strong> Cluster object</td>\n", + " <td style=\"text-align: left;\"><strong>Cluster type:</strong> distributed.LocalCluster</td>\n", + " \n", + " </tr>\n", + "\n", + " \n", + " <tr>\n", + " <td style=\"text-align: left;\">\n", + " <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:42853/status\" target=\"_blank\">http://127.0.0.1:42853/status</a>\n", + " </td>\n", + " <td style=\"text-align: left;\"></td>\n", + " </tr>\n", + " \n", + "\n", + " </table>\n", + "\n", + " \n", + " <details>\n", + " <summary style=\"margin-bottom: 20px;\"><h3 style=\"display: inline;\">Cluster Info</h3></summary>\n", + " <div class=\"jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-output\">\n", + " <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\">\n", + " </div>\n", + " <div style=\"margin-left: 48px;\">\n", + " <h3 style=\"margin-bottom: 0px; margin-top: 0px;\">LocalCluster</h3>\n", + " <p style=\"color: #9D9D9D; margin-bottom: 0px;\">bf1a677b</p>\n", + " <table style=\"width: 100%; text-align: left;\">\n", + " <tr>\n", + " <td style=\"text-align: left;\">\n", + " <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:42853/status\" target=\"_blank\">http://127.0.0.1:42853/status</a>\n", + " </td>\n", + " <td style=\"text-align: left;\">\n", + " <strong>Workers:</strong> 1\n", + " </td>\n", + " </tr>\n", + " <tr>\n", + " <td style=\"text-align: left;\">\n", + " <strong>Total threads:</strong> 5\n", + " </td>\n", + " <td style=\"text-align: left;\">\n", + " <strong>Total memory:</strong> 753.83 GiB\n", + " </td>\n", + " </tr>\n", + " \n", + " <tr>\n", + " <td style=\"text-align: left;\"><strong>Status:</strong> running</td>\n", + " <td style=\"text-align: left;\"><strong>Using processes:</strong> True</td>\n", + "</tr>\n", + "\n", + " \n", + " </table>\n", + "\n", + " <details>\n", + " <summary style=\"margin-bottom: 20px;\">\n", + " <h3 style=\"display: inline;\">Scheduler Info</h3>\n", + " </summary>\n", + "\n", + " <div style=\"\">\n", + " <div>\n", + " <div style=\"width: 24px; height: 24px; background-color: #FFF7E5; border: 3px solid #FF6132; border-radius: 5px; position: absolute;\"> </div>\n", + " <div style=\"margin-left: 48px;\">\n", + " <h3 style=\"margin-bottom: 0px;\">Scheduler</h3>\n", + " <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Scheduler-4dc50ccd-fcd6-4939-be94-079b8855ef99</p>\n", + " <table style=\"width: 100%; text-align: left;\">\n", + " <tr>\n", + " <td style=\"text-align: left;\">\n", + " <strong>Comm:</strong> tcp://127.0.0.1:38787\n", + " </td>\n", + " <td style=\"text-align: left;\">\n", + " <strong>Workers:</strong> 1\n", + " </td>\n", + " </tr>\n", + " <tr>\n", + " <td style=\"text-align: left;\">\n", + " <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:42853/status\" target=\"_blank\">http://127.0.0.1:42853/status</a>\n", + " </td>\n", + " <td style=\"text-align: left;\">\n", + " <strong>Total threads:</strong> 5\n", + " </td>\n", + " </tr>\n", + " <tr>\n", + " <td style=\"text-align: left;\">\n", + " <strong>Started:</strong> Just now\n", + " </td>\n", + " <td style=\"text-align: left;\">\n", + " <strong>Total memory:</strong> 753.83 GiB\n", + " </td>\n", + " </tr>\n", + " </table>\n", + " </div>\n", + " </div>\n", + "\n", + " <details style=\"margin-left: 48px;\">\n", + " <summary style=\"margin-bottom: 20px;\">\n", + " <h3 style=\"display: inline;\">Workers</h3>\n", + " </summary>\n", + "\n", + " \n", + " <div style=\"margin-bottom: 20px;\">\n", + " <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n", + " <div style=\"margin-left: 48px;\">\n", + " <details>\n", + " <summary>\n", + " <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 0</h4>\n", + " </summary>\n", + " <table style=\"width: 100%; text-align: left;\">\n", + " <tr>\n", + " <td style=\"text-align: left;\">\n", + " <strong>Comm: </strong> tcp://127.0.0.1:34679\n", + " </td>\n", + " <td style=\"text-align: left;\">\n", + " <strong>Total threads: </strong> 5\n", + " </td>\n", + " </tr>\n", + " <tr>\n", + " <td style=\"text-align: left;\">\n", + " <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:37085/status\" target=\"_blank\">http://127.0.0.1:37085/status</a>\n", + " </td>\n", + " <td style=\"text-align: left;\">\n", + " <strong>Memory: </strong> 753.83 GiB\n", + " </td>\n", + " </tr>\n", + " <tr>\n", + " <td style=\"text-align: left;\">\n", + " <strong>Nanny: </strong> tcp://127.0.0.1:35941\n", + " </td>\n", + " <td style=\"text-align: left;\"></td>\n", + " </tr>\n", + " <tr>\n", + " <td colspan=\"2\" style=\"text-align: left;\">\n", + " <strong>Local directory: </strong> /tmp/dask-worker-space/worker-1b2oh5pr\n", + " </td>\n", + " </tr>\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " </table>\n", + " </details>\n", + " </div>\n", + " </div>\n", + " \n", + "\n", + " </details>\n", + "</div>\n", + "\n", + " </details>\n", + " </div>\n", + "</div>\n", + " </details>\n", + " \n", + "\n", + " </div>\n", + "</div>" + ], + "text/plain": [ + "<Client: 'tcp://127.0.0.1:38787' processes=1 threads=5, memory=753.83 GiB>" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7bea8c4c-2b36-4d07-9689-d69c07b1af53", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "start_time = time.time()\n", + "ds=xr.open_mfdataset(path+\"era5-2000-*.nc\", engine=\"netcdf4\", chunks={\"valid_time\":1e5} )\n", + "ds[\"wspd\"] = core.windspeed(ds)\n", + "pvpot = core.pv_pot(ds).groupby(ds.valid_time.dt.month).mean(\"valid_time\").compute()\n", + "end_time = time.time()\n", + "execution_time = end_time - start_time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f637f592-f58b-46ad-885a-80670c8ba292", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Execution time: {execution_time:.5f} seconds\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70964d66-66a4-44bd-bf54-e64ef6876e67", + "metadata": {}, + "outputs": [], + "source": [ + "client.shutdown()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a375ca41-8af0-46e6-aa25-80241b9ab8f1", + "metadata": {}, + "outputs": [], + "source": [ + "pvpot" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "MagicPy", + "language": "python", + "name": "magicpy" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/analysis/exercise_20241022.ipynb b/analysis/exercise_20241022.ipynb index 2379132..e497cda 100644 --- a/analysis/exercise_20241022.ipynb +++ b/analysis/exercise_20241022.ipynb @@ -84,7 +84,7 @@ "metadata": {}, "outputs": [], "source": [ - "sechour=3600 # secondd per hour\n", + "sechour=3600 # seconds per hour\n", "\n", "c1 = 4.3\n", "c2 = 0.943\n", diff --git a/analysis/exercise_20241029.ipynb b/analysis/exercise_20241029.ipynb new file mode 100644 index 0000000..146e8ad --- /dev/null +++ b/analysis/exercise_20241029.ipynb @@ -0,0 +1,198 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7d41920d-ccc6-4657-bacc-19006b87d7b8", + "metadata": {}, + "source": [ + "# Example calculation of PV potential for ERA5 data\n", + "\n", + "I calculate for one year and derive monthly-mean values of PV potential. I compare two manners of parallelizing the work:\n", + "\n", + "1. multiprocessing with 1 process per file, hence 12 processes; each process uses dask-xarray under the hood and can hence use >100% CPU\n", + "2. dask-xarray processing of 1 dataset with all 12 months" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "99875287-2b84-460f-9c90-2e2542ff4e9b", + "metadata": {}, + "outputs": [], + "source": [ + "import xarray as xr\n", + "import numpy as np\n", + "from pathlib import Path\n", + "import matplotlib.pyplot as plt\n", + "from multiprocessing import Process, Queue\n", + "\n", + "import core as core\n", + "\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1f4067af-2ee7-43f5-b43d-94739e229f5c", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# location of era5 data on teachinghub\n", + "path=\"/home/voigta80/LEHRE/msc-intro-comp-met-ex-w2024/data/era5/\"" + ] + }, + { + "cell_type": "markdown", + "id": "e447b6c3-da76-4bf2-a417-2d957389ab73", + "metadata": {}, + "source": [ + "## 1. Multiprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "973862f1-8afc-417c-a009-0864322bb95b", + "metadata": {}, + "outputs": [], + "source": [ + "# generate list of era5 files for a given year\n", + "def get_filelists(year: str):\n", + " flist = list()\n", + " for file in Path(path).rglob(\"era5-\"+year+\"-*.nc\"):\n", + " flist.append(file)\n", + " return flist" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "83ae56d9-2577-4fee-9f2c-5ef80da798ec", + "metadata": {}, + "outputs": [], + "source": [ + "# function to compute time-mean pv potential, will be called by multiprocessing\n", + "def batchcompute_pvpot(file, queue):\n", + " ds = xr.open_dataset(file, engine=\"netcdf4\", chunks={\"valid_time\":1e5} )\n", + " ds[\"wspd\"] = core.windspeed(ds)\n", + " pv_pot = core.pv_pot(ds).mean(\"valid_time\").compute()\n", + " queue.put(pv_pot)\n", + " return None" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8456f917-6cad-4fd7-940b-d083bf15ac78", + "metadata": {}, + "outputs": [], + "source": [ + "nlat = 721;\n", + "nlon = 1440;\n", + "\n", + "def multi_processing():\n", + " year= \"2000\"\n", + " flist = get_filelists(year)\n", + " # use 1 process per monthly file\n", + " nprocs = len(flist)\n", + " # output from each process\n", + " pvpot_chk = np.zeros((nprocs,nlat,nlon))\n", + " queue = Queue()\n", + " processes = [Process(target=batchcompute_pvpot, \n", + " args=(flist[i], queue)) for i in range(0, nprocs)]\n", + " for process in processes: process.start() # start all processes\n", + " for i in range(nprocs): # collect results from processes\n", + " pvpot_chk[i] = queue.get()\n", + " for process in processes: process.join() # wait for all processes to complete\n", + " # merge into yearly array\n", + " pvpot = np.stack(pvpot_chk, axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a2b43143-a8d2-4691-a7c9-aea6673cc231", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Execution time: 78.37880 seconds\n" + ] + } + ], + "source": [ + "core.measure_performance(multi_processing)" + ] + }, + { + "cell_type": "markdown", + "id": "ee433aec-223e-445c-9040-70e86d1118e1", + "metadata": {}, + "source": [ + "## 2. Dask-array on merged file" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "daf3daff-d39d-4a49-8c13-3ffba150a8b4", + "metadata": {}, + "outputs": [], + "source": [ + "def dask_xarray():\n", + " ds2=xr.open_mfdataset(path+\"era5-2000-*.nc\", chunks={\"valid_time\":1e5} )\n", + " ds2[\"wspd\"] = core.windspeed(ds2)\n", + " pvpot2 = core.pv_pot(ds2).groupby(ds2.valid_time.dt.month).mean(\"valid_time\").compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "0be1428c-aeca-4678-82df-81666b2df90e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Execution time: 679.63075 seconds\n" + ] + } + ], + "source": [ + "core.measure_performance(dask_xarray)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "MagicPy", + "language": "python", + "name": "magicpy" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} -- GitLab