{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "tobac example: Compute bulk statistics as a postprocessing step\n",
    "=== \n",
    "\n",
    "Instead of during the feature detection or segmentation process, you can also calculate bulk statistics of your detected/tracked objects as a postprocessing step, i.e. based on a segmentation mask. This makes it possible to combine different datasets and derive statistics for your detected features based on other input fields (e.g., get precipitation statistics under cloud features that were segmented based on brightness temperatures or outgoing longwave radiation). \n",
    "\n",
    "This notebook shows an example for how to compute bulk statistics for detected features as a postprocessing step, that is based on the segmentation mask that we have already created. We perform the feature detection and segmentation with data from [our example for precipitation tracking](https://github.com/tobac-project/tobac/blob/main/examples/Example_Precip_Tracking/Example_Precip_Tracking.ipynb). "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-18T15:02:20.915626Z",
     "iopub.status.busy": "2025-12-18T15:02:20.915507Z",
     "iopub.status.idle": "2025-12-18T15:02:21.733403Z",
     "shell.execute_reply": "2025-12-18T15:02:21.733031Z"
    }
   },
   "outputs": [],
   "source": [
    "# Import libraries\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import xarray as xr \n",
    "import matplotlib.pyplot as plt\n",
    "import datetime\n",
    "import shutil\n",
    "from six.moves import urllib\n",
    "from pathlib import Path\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-18T15:02:21.735197Z",
     "iopub.status.busy": "2025-12-18T15:02:21.734991Z",
     "iopub.status.idle": "2025-12-18T15:02:23.396868Z",
     "shell.execute_reply": "2025-12-18T15:02:23.396511Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "using tobac version 1.6.2\n"
     ]
    }
   ],
   "source": [
    "# Import tobac itself\n",
    "import tobac\n",
    "print('using tobac version', str(tobac.__version__))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-18T15:02:23.416352Z",
     "iopub.status.busy": "2025-12-18T15:02:23.416075Z",
     "iopub.status.idle": "2025-12-18T15:02:23.418226Z",
     "shell.execute_reply": "2025-12-18T15:02:23.417963Z"
    }
   },
   "outputs": [],
   "source": [
    "# Disable a few warnings:\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore', category=UserWarning, append=True)\n",
    "warnings.filterwarnings('ignore', category=RuntimeWarning, append=True)\n",
    "warnings.filterwarnings('ignore', category=FutureWarning, append=True)\n",
    "warnings.filterwarnings('ignore',category=pd.io.pytables.PerformanceWarning)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Feature detection** "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-18T15:02:23.419654Z",
     "iopub.status.busy": "2025-12-18T15:02:23.419553Z",
     "iopub.status.idle": "2025-12-18T15:02:23.421506Z",
     "shell.execute_reply": "2025-12-18T15:02:23.421233Z"
    }
   },
   "outputs": [],
   "source": [
    "#Set up directory to save output:\n",
    "savedir=Path(\"Save\")\n",
    "if not savedir.is_dir():\n",
    "    savedir.mkdir()\n",
    "plot_dir=Path(\"Plot\")\n",
    "if not plot_dir.is_dir():\n",
    "    plot_dir.mkdir()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-18T15:02:23.422897Z",
     "iopub.status.busy": "2025-12-18T15:02:23.422784Z",
     "iopub.status.idle": "2025-12-18T15:02:23.426289Z",
     "shell.execute_reply": "2025-12-18T15:02:23.426024Z"
    }
   },
   "outputs": [],
   "source": [
    "data_out=Path('../../../examples')\n",
    "# Download the data: This only has to be done once for all tobac examples and can take a while\n",
    "data_file = list(data_out.rglob('data/Example_input_Precip.nc'))\n",
    "if len(data_file) == 0:\n",
    "    file_path='https://zenodo.org/records/3195910/files/climate-processes/tobac_example_data-v1.0.1.zip'\n",
    "    #file_path='http://zenodo..'\n",
    "    tempfile=Path('temp.zip')\n",
    "    print('start downloading data')\n",
    "    request=urllib.request.urlretrieve(file_path, tempfile)\n",
    "    print('start extracting data')\n",
    "    shutil.unpack_archive(tempfile, data_out)\n",
    "    tempfile.unlink()\n",
    "    print('data extracted')\n",
    "    data_file = list(data_out.rglob('data/Example_input_Precip.nc'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-18T15:02:23.427443Z",
     "iopub.status.busy": "2025-12-18T15:02:23.427365Z",
     "iopub.status.idle": "2025-12-18T15:02:23.968606Z",
     "shell.execute_reply": "2025-12-18T15:02:23.968298Z"
    }
   },
   "outputs": [],
   "source": [
    "Precip=xr.open_dataset(data_file[0])['surface_precipitation_average']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-18T15:02:23.970233Z",
     "iopub.status.busy": "2025-12-18T15:02:23.970050Z",
     "iopub.status.idle": "2025-12-18T15:02:24.079363Z",
     "shell.execute_reply": "2025-12-18T15:02:24.079016Z"
    }
   },
   "outputs": [],
   "source": [
    "parameters_features={}\n",
    "parameters_features['position_threshold']='weighted_diff'\n",
    "parameters_features['sigma_threshold']=0.5\n",
    "parameters_features['min_distance']=0\n",
    "parameters_features['sigma_threshold']=1\n",
    "parameters_features['threshold']=[1,2,3,4,5,10,15] #mm/h\n",
    "parameters_features['n_erosion_threshold']=0\n",
    "parameters_features['n_min_threshold']=3\n",
    "\n",
    "# get temporal and spation resolution of the data\n",
    "dxy,dt=tobac.get_spacings(Precip)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-18T15:02:24.081344Z",
     "iopub.status.busy": "2025-12-18T15:02:24.081211Z",
     "iopub.status.idle": "2025-12-18T15:02:25.033298Z",
     "shell.execute_reply": "2025-12-18T15:02:25.033045Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "starting feature detection based on multiple thresholds\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "feature detection done\n",
      "features saved\n"
     ]
    }
   ],
   "source": [
    "# Feature detection based on based on surface precipitation field and a range of thresholds\n",
    "print('starting feature detection based on multiple thresholds')\n",
    "Features= tobac.feature_detection_multithreshold(Precip,dxy,**parameters_features) \n",
    "print('feature detection done')\n",
    "Features.to_hdf(savedir / 'Features.h5','table')\n",
    "print('features saved')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Segmentation** "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-18T15:02:25.034739Z",
     "iopub.status.busy": "2025-12-18T15:02:25.034546Z",
     "iopub.status.idle": "2025-12-18T15:02:25.039823Z",
     "shell.execute_reply": "2025-12-18T15:02:25.039585Z"
    }
   },
   "outputs": [],
   "source": [
    "# Dictionary containing keyword arguments for segmentation step:\n",
    "parameters_segmentation={}\n",
    "parameters_segmentation['method']='watershed'\n",
    "parameters_segmentation['threshold']=1  # mm/h mixing ratio\n",
    "\n",
    "# get temporal and spation resolution of the data\n",
    "dxy,dt=tobac.get_spacings(Precip)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-18T15:02:25.041009Z",
     "iopub.status.busy": "2025-12-18T15:02:25.040901Z",
     "iopub.status.idle": "2025-12-18T15:02:25.589778Z",
     "shell.execute_reply": "2025-12-18T15:02:25.589523Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting segmentation based on surface precipitation\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "segmentation based on surface precipitation performed, start saving results to files\n",
      "segmentation surface precipitation performed and saved\n"
     ]
    }
   ],
   "source": [
    "# Perform Segmentation and save resulting mask to NetCDF file:\n",
    "print('Starting segmentation based on surface precipitation')\n",
    "Mask_Precip,Features_Precip=tobac.segmentation_2D(Features,Precip,dxy,**parameters_segmentation)\n",
    "print('segmentation based on surface precipitation performed, start saving results to files')\n",
    "Mask_Precip.to_netcdf(savedir / 'Mask_segmentation_precip.nc', encoding={\"segmentation_mask\":{\"zlib\":True, \"complevel\":4}})\n",
    "Features_Precip.to_hdf(savedir / 'Features_Precip.h5', 'table')\n",
    "print('segmentation surface precipitation performed and saved')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Get bulk statistics from segmentation mask file**\n",
    "\n",
    "You can decide which statistics to calculate by providing a dictionary with the name of the metric as keys (this will be the name of the column added to the dataframe) and functions as values. Note that it is also possible to provide input parameter to these functions. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-18T15:02:25.591161Z",
     "iopub.status.busy": "2025-12-18T15:02:25.591059Z",
     "iopub.status.idle": "2025-12-18T15:02:25.592878Z",
     "shell.execute_reply": "2025-12-18T15:02:25.592600Z"
    }
   },
   "outputs": [],
   "source": [
    "from tobac.utils import get_statistics_from_mask"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-07-10T23:49:21.717102Z",
     "iopub.status.busy": "2023-07-10T23:49:21.716272Z",
     "iopub.status.idle": "2023-07-10T23:49:21.985540Z",
     "shell.execute_reply": "2023-07-10T23:49:21.984704Z"
    }
   },
   "source": [
    "#### Defining the dictionary for the statistics to be calculated "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-18T15:02:25.594125Z",
     "iopub.status.busy": "2025-12-18T15:02:25.594025Z",
     "iopub.status.idle": "2025-12-18T15:02:25.595767Z",
     "shell.execute_reply": "2025-12-18T15:02:25.595564Z"
    }
   },
   "outputs": [],
   "source": [
    "statistics = {}\n",
    "statistics['mean_precip'] = np.mean\n",
    "statistics['total_precip'] = np.sum\n",
    "statistics['max_precip'] = np.max"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For some functions, we need to provide additional input parameters, e.g. [np.percentile()](https://numpy.org/doc/stable/reference/generated/numpy.percentile.html). These can be provided as key word arguments in form of a dictionary. So instead of the function, you can provide a tuple with both the function and its respective input parameters: \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-18T15:02:25.597067Z",
     "iopub.status.busy": "2025-12-18T15:02:25.596921Z",
     "iopub.status.idle": "2025-12-18T15:02:25.598823Z",
     "shell.execute_reply": "2025-12-18T15:02:25.598581Z"
    }
   },
   "outputs": [],
   "source": [
    "statistics['percentiles'] = (np.percentile, {'q': [95,99]})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-18T15:02:25.599942Z",
     "iopub.status.busy": "2025-12-18T15:02:25.599852Z",
     "iopub.status.idle": "2025-12-18T15:02:26.292609Z",
     "shell.execute_reply": "2025-12-18T15:02:26.292296Z"
    }
   },
   "outputs": [],
   "source": [
    "features_with_stats = get_statistics_from_mask(Features, Mask_Precip, Precip, statistic=statistics)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Look at the output: "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-18T15:02:26.294212Z",
     "iopub.status.busy": "2025-12-18T15:02:26.294121Z",
     "iopub.status.idle": "2025-12-18T15:02:26.297118Z",
     "shell.execute_reply": "2025-12-18T15:02:26.296890Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1.629695\n",
       "1    1.409547\n",
       "2    2.441526\n",
       "3    1.938501\n",
       "4    2.486886\n",
       "Name: mean_precip, dtype: object"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "features_with_stats.mean_precip.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-18T15:02:26.298298Z",
     "iopub.status.busy": "2025-12-18T15:02:26.298211Z",
     "iopub.status.idle": "2025-12-18T15:02:26.300739Z",
     "shell.execute_reply": "2025-12-18T15:02:26.300454Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    16.296951\n",
       "1    14.095468\n",
       "2    26.856783\n",
       "3    36.831512\n",
       "4    49.737709\n",
       "Name: total_precip, dtype: object"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "features_with_stats.total_precip.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-12-18T15:02:26.301994Z",
     "iopub.status.busy": "2025-12-18T15:02:26.301899Z",
     "iopub.status.idle": "2025-12-18T15:02:26.304713Z",
     "shell.execute_reply": "2025-12-18T15:02:26.304483Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      ([2.221776068210602, 2.276183712482452],)\n",
       "1    ([1.8030404090881347, 1.8164567756652832],)\n",
       "2      ([3.710712432861328, 3.759503173828125],)\n",
       "3      ([3.940941762924194, 4.042321195602417],)\n",
       "4     ([4.087516045570374, 4.3222578477859495],)\n",
       "Name: percentiles, dtype: object"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "features_with_stats.percentiles.head()"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}