{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Parallel Processing of Feature Detection with `dask`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook demonstrates how to run *tobac* feature detection in parallel using the `dask` library as the parallel processor." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imports and Dask Cluster Setup" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2026-02-02T20:13:34.154257Z", "iopub.status.busy": "2026-02-02T20:13:34.153994Z", "iopub.status.idle": "2026-02-02T20:13:34.403115Z", "shell.execute_reply": "2026-02-02T20:13:34.402555Z" } }, "outputs": [], "source": [ "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2026-02-02T20:13:34.405033Z", "iopub.status.busy": "2026-02-02T20:13:34.404865Z", "iopub.status.idle": "2026-02-02T20:13:36.607735Z", "shell.execute_reply": "2026-02-02T20:13:36.606988Z" } }, "outputs": [], "source": [ "import tobac\n", "import dask.bag as db\n", "import xarray as xr\n", "import s3fs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There are many different ways to initialize a dask cluster. This is just one example, running two workers on a single local machine. " ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2026-02-02T20:13:36.609710Z", "iopub.status.busy": "2026-02-02T20:13:36.609410Z", "iopub.status.idle": "2026-02-02T20:13:37.614715Z", "shell.execute_reply": "2026-02-02T20:13:37.614166Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "
\n", "
\n", "

Client

\n", "

Client-a81ef1fe-0073-11f1-88af-5ce91e89f225

\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "
Connection method: Cluster objectCluster type: distributed.LocalCluster
\n", " Dashboard: http://127.0.0.1:8787/status\n", "
\n", "\n", " \n", "\n", " \n", "
\n", "

Cluster Info

\n", "
\n", "
\n", "
\n", "
\n", "

LocalCluster

\n", "

47c93e83

\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", " \n", "
\n", " Dashboard: http://127.0.0.1:8787/status\n", " \n", " Workers: 2\n", "
\n", " Total threads: 2\n", " \n", " Total memory: 64.00 GiB\n", "
Status: runningUsing processes: True
\n", "\n", "
\n", " \n", "

Scheduler Info

\n", "
\n", "\n", "
\n", "
\n", "
\n", "
\n", "

Scheduler

\n", "

Scheduler-97d48457-ae1a-4c58-ae75-b2ce78a333fb

\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", " Comm: tcp://127.0.0.1:58618\n", " \n", " Workers: 0 \n", "
\n", " Dashboard: http://127.0.0.1:8787/status\n", " \n", " Total threads: 0\n", "
\n", " Started: Just now\n", " \n", " Total memory: 0 B\n", "
\n", "
\n", "
\n", "\n", "
\n", " \n", "

Workers

\n", "
\n", "\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "

Worker: 0

\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", "\n", " \n", "\n", "
\n", " Comm: tcp://127.0.0.1:58634\n", " \n", " Total threads: 1\n", "
\n", " Dashboard: http://127.0.0.1:58635/status\n", " \n", " Memory: 32.00 GiB\n", "
\n", " Nanny: tcp://127.0.0.1:58621\n", "
\n", " Local directory: /var/folders/bj/m6g82c6n41g83y3_dx02y7ch0000gp/T/dask-scratch-space/worker-xpiuck_r\n", "
\n", "
\n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "

Worker: 1

\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", "\n", " \n", "\n", "
\n", " Comm: tcp://127.0.0.1:58633\n", " \n", " Total threads: 1\n", "
\n", " Dashboard: http://127.0.0.1:58636/status\n", " \n", " Memory: 32.00 GiB\n", "
\n", " Nanny: tcp://127.0.0.1:58623\n", "
\n", " Local directory: /var/folders/bj/m6g82c6n41g83y3_dx02y7ch0000gp/T/dask-scratch-space/worker-zn2bx8bj\n", "
\n", "
\n", "
\n", "
\n", " \n", "\n", "
\n", "
\n", "\n", "
\n", "
\n", "
\n", "
\n", " \n", "\n", "
\n", "
" ], "text/plain": [ "" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from dask.distributed import Client, progress\n", "\n", "client = Client(n_workers=2, threads_per_worker=1)\n", "client" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read in Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here, we are using the NOAA Global Mosaic of Geostationary Satellite Imagery (GMGSI) as our input data source from AWS s3." ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2026-02-02T20:13:37.635188Z", "iopub.status.busy": "2026-02-02T20:13:37.634999Z", "iopub.status.idle": "2026-02-02T20:13:39.522443Z", "shell.execute_reply": "2026-02-02T20:13:39.521886Z" } }, "outputs": [], "source": [ "# Anonymous S3 access (anon=True); no credentials are passed.\n", "fs = s3fs.S3FileSystem(anon=True)\n", "aws_urls = [\n", "    \"s3://noaa-gmgsi-pds/GMGSI_LW/2024/01/01/00/GLOBCOMPLIR_nc.2024010100\",\n", "    \"s3://noaa-gmgsi-pds/GMGSI_LW/2024/01/01/01/GLOBCOMPLIR_nc.2024010101\",\n", "]\n", "\n", "# One dataset per URL, each opened from a file-like S3 handle with the h5netcdf engine.\n", "all_ds = [\n", "    xr.open_dataset(fs.open(aws_url), engine=\"h5netcdf\")\n", "    for aws_url in aws_urls\n", "]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We loaded in two files and we will use xarray to concatenate them." 
] }, { "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2026-02-02T20:13:39.525121Z", "iopub.status.busy": "2026-02-02T20:13:39.524687Z", "iopub.status.idle": "2026-02-02T20:13:40.569148Z", "shell.execute_reply": "2026-02-02T20:13:40.568277Z" } }, "outputs": [], "source": [ "combined_ds = xr.concat(all_ds, dim=\"time\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2026-02-02T20:13:40.571844Z", "iopub.status.busy": "2026-02-02T20:13:40.571652Z", "iopub.status.idle": "2026-02-02T20:13:40.580241Z", "shell.execute_reply": "2026-02-02T20:13:40.579686Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
<xarray.Dataset> Size: 240MB\n",
       "Dimensions:  (time: 2, yc: 3000, xc: 4999)\n",
       "Coordinates:\n",
       "  * time     (time) datetime64[ns] 16B 2024-01-01 2024-01-01T01:00:00\n",
       "    lat      (yc, xc) float32 60MB 72.72 72.72 72.72 ... -72.74 -72.74 -72.74\n",
       "    lon      (yc, xc) float32 60MB 180.0 -179.9 -179.9 ... 179.8 179.8 179.9\n",
       "Dimensions without coordinates: yc, xc\n",
       "Data variables:\n",
       "    data     (time, yc, xc) float32 120MB 206.0 204.0 204.0 ... 188.0 182.0\n",
       "Attributes:\n",
       "    Conventions:          CF-1.4\n",
       "    Source:               McIDAS Area File\n",
       "    Satellite Sensor:     DERIVED DATA\n",
       "    time_coverage_start:  2024-01-01T00:00:00\n",
       "    instrument_name:      GLOBCOMPLIR\n",
       "    history:              Mon Jan  1 00:38:21 2024: ncks -d xc,0,4998 templir...\n",
       "    NCO:                  netCDF Operators version 4.7.5 (Homepage = http://n...
" ], "text/plain": [ " Size: 240MB\n", "Dimensions: (time: 2, yc: 3000, xc: 4999)\n", "Coordinates:\n", " * time (time) datetime64[ns] 16B 2024-01-01 2024-01-01T01:00:00\n", " lat (yc, xc) float32 60MB 72.72 72.72 72.72 ... -72.74 -72.74 -72.74\n", " lon (yc, xc) float32 60MB 180.0 -179.9 -179.9 ... 179.8 179.8 179.9\n", "Dimensions without coordinates: yc, xc\n", "Data variables:\n", " data (time, yc, xc) float32 120MB 206.0 204.0 204.0 ... 188.0 182.0\n", "Attributes:\n", " Conventions: CF-1.4\n", " Source: McIDAS Area File\n", " Satellite Sensor: DERIVED DATA\n", " time_coverage_start: 2024-01-01T00:00:00\n", " instrument_name: GLOBCOMPLIR\n", " history: Mon Jan 1 00:38:21 2024: ncks -d xc,0,4998 templir...\n", " NCO: netCDF Operators version 4.7.5 (Homepage = http://n..." ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "combined_ds" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "These feature detection parameters are just examples." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## *tobac* Feature Detection" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2026-02-02T20:13:40.581827Z", "iopub.status.busy": "2026-02-02T20:13:40.581715Z", "iopub.status.idle": "2026-02-02T20:13:40.583678Z", "shell.execute_reply": "2026-02-02T20:13:40.583310Z" } }, "outputs": [], "source": [ "parameters_features = {\n", "    \"position_threshold\": \"weighted_diff\",\n", "    \"sigma_threshold\": 0.5,\n", "    \"n_min_threshold\": 4,\n", "    \"target\": \"minimum\",\n", "    \"threshold\": [180, 170],\n", "    \"PBC_flag\": \"hdim_2\",\n", "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "While future versions (1.6 and greater) of *tobac* will support xarray natively in feature detection and segmentation, current versions of *tobac* rely on Iris for gridded data. Because of this, we have to adjust some metadata so that this data is compatible with Iris." ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2026-02-02T20:13:40.585219Z", "iopub.status.busy": "2026-02-02T20:13:40.585072Z", "iopub.status.idle": "2026-02-02T20:13:40.587108Z", "shell.execute_reply": "2026-02-02T20:13:40.586634Z" } }, "outputs": [], "source": [ "# Iris compatibility: give the field explicit units and the time coordinate a long_name.\n", "combined_ds[\"data\"].attrs[\"units\"] = \"kelvin\"\n", "combined_ds[\"data\"][\"time\"].attrs[\"long_name\"] = \"time\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, we will use a *dask bag* to parallelize our feature detection over time. 
" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "execution": { "iopub.execute_input": "2026-02-02T20:13:40.588538Z", "iopub.status.busy": "2026-02-02T20:13:40.588441Z", "iopub.status.idle": "2026-02-02T20:13:48.987353Z", "shell.execute_reply": "2026-02-02T20:13:48.986834Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/seanfreeman/mambaforge/envs/tobac_dev_3_13/lib/python3.13/site-packages/distributed/client.py:3374: UserWarning: Sending large graph of size 57.21 MiB.\n", "This may cause some slowdown.\n", "Consider loading the data with Dask directly\n", " or using futures or delayed objects to embed the data into the graph without repetition.\n", "See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.\n", " warnings.warn(\n" ] } ], "source": [ "n_times = len(combined_ds[\"time\"])\n", "\n", "# One single-time-step field per bag element. Only `yc` is cropped (first 500\n", "# rows); `xc` keeps its full extent.\n", "time_step_fields = [\n", "    combined_ds[\"data\"].isel(time=slice(t, t + 1), yc=slice(0, 500))\n", "    for t in range(n_times)\n", "]\n", "\n", "# One partition per time step, so each time step is its own task and the\n", "# workers can process them concurrently.\n", "time_step_bag = db.from_sequence(time_step_fields, npartitions=n_times)\n", "\n", "\n", "def detect_features(field):\n", "    \"\"\"Run tobac feature detection on one single-time-step field.\"\"\"\n", "    # 4000 is tobac's second positional argument (grid spacing `dxy`).\n", "    return tobac.feature_detection_multithreshold(field, 4000, **parameters_features)\n", "\n", "\n", "out_feature_dfs = time_step_bag.map(detect_features).compute()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Combining parallel-detected features into one coherent DataFrame" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "execution": { "iopub.execute_input": "2026-02-02T20:13:48.989469Z", "iopub.status.busy": "2026-02-02T20:13:48.988794Z", "iopub.status.idle": "2026-02-02T20:13:48.998916Z", "shell.execute_reply": "2026-02-02T20:13:48.998505Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
frameidxhdim_1hdim_2numthreshold_valuefeaturetimetimestrlatlon
0010.97532560.7181322618012024-01-01 00:00:002024-01-01 00:00:0072.694528-175.628134
1020.67001879.0748181618022024-01-01 00:00:002024-01-01 00:00:0072.701065-174.306289
2050.3581151492.9797781518032024-01-01 00:00:002024-01-01 00:00:0072.707742-72.492469
3060.4825791531.5202152618042024-01-01 00:00:002024-01-01 00:00:0072.705077-69.717214
4073.4098962113.18577028518052024-01-01 00:00:002024-01-01 00:00:0072.642292-27.832080
....................................
156511283498.18090630.0050311617015662024-01-01 01:00:002024-01-01 01:00:0058.262049-177.839756
156611284497.817803715.879216717015672024-01-01 01:00:002024-01-01 01:00:0058.275803-128.450668
156711285498.1582323295.1861716317015682024-01-01 01:00:002024-01-01 01:00:0058.26290857.282537
156811288498.4544043793.095530517015692024-01-01 01:00:002024-01-01 01:00:0058.25168293.136465
156911293498.6489113317.641704617015702024-01-01 01:00:002024-01-01 01:00:0058.24431058.899535
\n", "

1570 rows × 11 columns

\n", "
" ], "text/plain": [ " frame idx hdim_1 hdim_2 num threshold_value feature \\\n", "0 0 1 0.975325 60.718132 26 180 1 \n", "1 0 2 0.670018 79.074818 16 180 2 \n", "2 0 5 0.358115 1492.979778 15 180 3 \n", "3 0 6 0.482579 1531.520215 26 180 4 \n", "4 0 7 3.409896 2113.185770 285 180 5 \n", "... ... ... ... ... ... ... ... \n", "1565 1 1283 498.180906 30.005031 16 170 1566 \n", "1566 1 1284 497.817803 715.879216 7 170 1567 \n", "1567 1 1285 498.158232 3295.186171 63 170 1568 \n", "1568 1 1288 498.454404 3793.095530 5 170 1569 \n", "1569 1 1293 498.648911 3317.641704 6 170 1570 \n", "\n", " time timestr lat lon \n", "0 2024-01-01 00:00:00 2024-01-01 00:00:00 72.694528 -175.628134 \n", "1 2024-01-01 00:00:00 2024-01-01 00:00:00 72.701065 -174.306289 \n", "2 2024-01-01 00:00:00 2024-01-01 00:00:00 72.707742 -72.492469 \n", "3 2024-01-01 00:00:00 2024-01-01 00:00:00 72.705077 -69.717214 \n", "4 2024-01-01 00:00:00 2024-01-01 00:00:00 72.642292 -27.832080 \n", "... ... ... ... ... \n", "1565 2024-01-01 01:00:00 2024-01-01 01:00:00 58.262049 -177.839756 \n", "1566 2024-01-01 01:00:00 2024-01-01 01:00:00 58.275803 -128.450668 \n", "1567 2024-01-01 01:00:00 2024-01-01 01:00:00 58.262908 57.282537 \n", "1568 2024-01-01 01:00:00 2024-01-01 01:00:00 58.251682 93.136465 \n", "1569 2024-01-01 01:00:00 2024-01-01 01:00:00 58.244310 58.899535 \n", "\n", "[1570 rows x 11 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tobac.utils.general.combine_feature_dataframes(out_feature_dfs)" ] } ], "metadata": { "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.11" } }, "nbformat": 4, "nbformat_minor": 4 }