Source code for epyt_flow.data.benchmarks.gecco_water_quality

"""
This module provides functions for loading the different GECCO water quality data sets.

+------------------------------+---------------------------------------------------------------------------------------------+
| GECCO Water Quality 2017     | :func:`~epyt_flow.data.benchmarks.gecco_water_quality.load_gecco2017_water_quality_data`    |
+------------------------------+---------------------------------------------------------------------------------------------+
| GECCO Water Quality 2018     | :func:`~epyt_flow.data.benchmarks.gecco_water_quality.load_gecco2018_water_quality_data`    |
+------------------------------+---------------------------------------------------------------------------------------------+
| GECCO Water Quality 2019     | :func:`~epyt_flow.data.benchmarks.gecco_water_quality.load_gecco2019_water_quality_data`    |
+------------------------------+---------------------------------------------------------------------------------------------+

Note that the scoring/evaluation algorithm is the same for all GECCO water quality benchmarks
and is implemented in
:func:`~epyt_flow.data.benchmarks.gecco_water_quality.compute_evaluation_score`.
"""
 16import os
 17from typing import Union
 18import numpy as np
 19import pandas as pd
 20from sklearn.metrics import f1_score
 21
 22from ...utils import get_temp_folder, download_if_necessary
 23
 24
def compute_evaluation_score(y_pred: np.ndarray, y: np.ndarray) -> float:
    """
    Scores an event detection method by comparing its predictions to the ground truth.

    .. note::
        All GECCO water quality challenges use the F1-score for evaluation.

    Parameters
    ----------
    y_pred : `numpy.ndarray <https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html>`_
        Predicted event indication over time.
    y : `numpy.ndarray <https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html>`_
        Ground truth event indication over time.

    Returns
    -------
    `float`
        Evaluation score -- i.e. the F1-score of `y_pred` with respect to `y`.
    """
    # The ground truth is the first argument of f1_score; keywords make the order explicit
    score = f1_score(y_true=y, y_pred=y_pred)
    return score
45 46
def load_gecco2017_water_quality_data(download_dir: Union[str, None] = None,
                                      return_X_y: bool = True,
                                      verbose: bool = True
                                      ) -> Union[pd.DataFrame, tuple[np.ndarray, np.ndarray]]:
    """
    GECCO Industrial Challenge 2017 Dataset: A water quality dataset for the
    "Monitoring of drinking-water quality" competition organized by M. Friese, J. Stork,
    A. Fischbach, M. Rebolledo, T. Bartz-Beielstein at the Genetic and Evolutionary
    Computation Conference 2017, Berlin, Germany.

    This is a benchmark for anomaly detection algorithms on water quality. The data is provided by
    the "Thüringer Fernwasserversorgung" (Germany) and constitutes a real-world data set. In this
    data set, 9 numeric water quality features are given at a sampling rate of 1 min over approx.
    3 months. The goal is to predict the presence of an anomaly -- i.e. binary classification.

    More information can be found at https://zenodo.org/records/3884465 and
    http://www.spotseven.de/gecco-challenge/gecco-challenge-2017/

    .. note::

        Note that this is NOT a simulated scenario and therefore only the final
        data set is provided.

    Parameters
    ----------
    download_dir : `str`, optional
        Path to the data files -- if None, the temp folder will be used.
        If the path does not exist, the data files will be downloaded to the given path.

        The default is None.
    return_X_y : `bool`, optional
        If True, the data is returned together with the labels as two Numpy arrays,
        otherwise the data is returned as Pandas data frame.

        The default is True.
    verbose : `bool`, optional
        If True, a progress bar is shown while downloading files.

        The default is True.

    Returns
    -------
    `pandas.DataFrame <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html>`_ or `tuple[numpy.ndarray, numpy.ndarray] <https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html>`_
        The benchmark data set as either a Pandas data frame or as a pair of (X, y) Numpy arrays.
        In the latter case, y is the "EVENT" column cast to `numpy.int8` and X contains
        all remaining columns except "Time".
    """
    url_data = "https://zenodo.org/records/3884465/files/1_gecco2017_water_quality.csv?download=1"

    # Fall back to the temp folder if no download directory was given
    if download_dir is None:
        download_dir = get_temp_folder()
    f_in = os.path.join(download_dir, "gecco2017_water_quality.csv")

    download_if_necessary(f_in, url_data, verbose)

    # Load data -- the first CSV column is used as the row index
    df_data = pd.read_csv(f_in, index_col=0)

    # Truthiness test instead of `is False`, so that falsy non-`bool` flags
    # (e.g. `numpy.False_` or 0) also select the data frame output
    if not return_X_y:
        return df_data

    # Labels come from the "EVENT" column; "Time" is not a feature and is dropped from X
    y = df_data["EVENT"].to_numpy().astype(np.int8)
    X = df_data.drop(columns=["EVENT", "Time"]).to_numpy()

    return X, y
111 112
def load_gecco2018_water_quality_data(download_dir: Union[str, None] = None,
                                      return_X_y: bool = True,
                                      verbose: bool = True
                                      ) -> Union[pd.DataFrame, tuple[np.ndarray, np.ndarray]]:
    """
    GECCO Industrial Challenge 2018 Dataset: A water quality dataset for the
    "Internet of Things: Online Anomaly Detection for Drinking Water Quality" competition
    organized by F. Rehbach, M. Rebolledo, S. Moritz, S. Chandrasekaran, T. Bartz-Beielstein at
    the Genetic and Evolutionary Computation Conference 2018, Kyoto, Japan.

    This is a benchmark
    (based on
    :func:`~epyt_flow.data.benchmarks.gecco_water_quality.load_gecco2017_water_quality_data`)
    for anomaly detection algorithms on water quality. The data is provided by the
    "Thüringer Fernwasserversorgung" (Germany) and constitutes a real-world data set. In this
    data set, 9 numeric water quality features are given at a sampling rate of 1 min over approx.
    3 months. The goal is to predict the presence of an anomaly -- i.e. binary classification.

    More information can be found at https://zenodo.org/records/3884398 and
    http://www.spotseven.de/gecco/gecco-challenge/gecco-challenge-2018/

    .. note::

        Note that this is NOT a simulated scenario and therefore only the final
        data set is provided.

    Parameters
    ----------
    download_dir : `str`, optional
        Path to the data files -- if None, the temp folder will be used.
        If the path does not exist, the data files will be downloaded to the given path.

        The default is None.
    return_X_y : `bool`, optional
        If True, the data is returned together with the labels as two Numpy arrays,
        otherwise the data is returned as Pandas data frame.

        The default is True.
    verbose : `bool`, optional
        If True, a progress bar is shown while downloading files.

        The default is True.

    Returns
    -------
    `pandas.DataFrame <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html>`_ or `tuple[numpy.ndarray, numpy.ndarray] <https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html>`_
        The benchmark data set as either a Pandas data frame or as a pair of (X, y) Numpy arrays.
        In the latter case, y is the "EVENT" column cast to `numpy.int8` and X contains
        all remaining columns except "Time".
    """
    # Download data if necessary
    url_data = "https://zenodo.org/records/3884398/files/1_gecco2018_water_quality.csv?download=1"

    # Fall back to the temp folder if no download directory was given
    if download_dir is None:
        download_dir = get_temp_folder()
    f_in = os.path.join(download_dir, "gecco2018_water_quality.csv")

    download_if_necessary(f_in, url_data, verbose)

    # Load data -- the first CSV column is used as the row index
    df_data = pd.read_csv(f_in, index_col=0)

    # Truthiness test instead of `is False`, so that falsy non-`bool` flags
    # (e.g. `numpy.False_` or 0) also select the data frame output
    if not return_X_y:
        return df_data

    # Labels come from the "EVENT" column; "Time" is not a feature and is dropped from X
    y = df_data["EVENT"].to_numpy().astype(np.int8)
    X = df_data.drop(columns=["EVENT", "Time"]).to_numpy()

    return X, y
181 182
def load_gecco2019_water_quality_data(download_dir: Union[str, None] = None,
                                      return_X_y: bool = True,
                                      verbose: bool = True) -> dict:
    """
    GECCO Industrial Challenge 2019 Dataset: A water quality dataset for the "Internet of Things:
    Online Event Detection for Drinking Water Quality Control" competition organized by
    F. Rehbach, S. Moritz, T. Bartz-Beielstein at the Genetic and Evolutionary Computation
    Conference 2019, Prague, Czech Republic.

    This is a benchmark
    (based on
    :func:`~epyt_flow.data.benchmarks.gecco_water_quality.load_gecco2018_water_quality_data`)
    for anomaly detection algorithms on water quality. The data is provided by the
    "Thüringer Fernwasserversorgung" (Germany) and constitutes a real-world data set. In this
    data set, 6 numeric water quality features are given at a sampling rate of 1 min over approx.
    3 months. The goal is to predict the presence of an anomaly -- i.e. binary classification.
    The data set itself comes in three splits: A train set, a validation set, and a test set.

    More information can be found at https://zenodo.org/records/4304080 and
    https://www.th-koeln.de/informatik-und-ingenieurwissenschaften/gecco-challenge-2019_63244.php

    .. note::

        Note that this is NOT a simulated scenario and therefore only the final
        data set is provided.

    Parameters
    ----------
    download_dir : `str`, optional
        Path to the data files -- if None, the temp folder will be used.
        If the path does not exist, the data files will be downloaded to the given path.

        The default is None.
    return_X_y : `bool`, optional
        If True, each split is returned as a pair of (X, y) Numpy arrays,
        otherwise each split is returned as a Pandas data frame.

        The default is True.
    verbose : `bool`, optional
        If True, a progress bar is shown while downloading files.

        The default is True.

    Returns
    -------
    `dict`
        The data set as a dictionary with entries "train", "validation", and "test" containing
        the respective data -- either a Pandas data frame or a tuple (X, y) of Numpy arrays,
        depending on `return_X_y`. In the latter case, y is the "Event" column cast to
        `numpy.int8` and X contains all remaining columns except "Time".
    """
    # Fall back to the temp folder if no download directory was given
    if download_dir is None:
        download_dir = get_temp_folder()

    base_url = "https://zenodo.org/records/4304080/files/"

    # Split name -> (remote file name, local file name).
    # NOTE(review): the "qulity" spelling of the validation file is kept exactly as it was
    # in the original code -- confirm against the file listing of the Zenodo record.
    split_files = {
        "train": ("7_gecco2019_train_water_quality.csv",
                  "gecco2019_train_water_quality.csv"),
        "validation": ("8_gecco2019_valid_water_qulity.csv",
                       "gecco2019_valid_water_qulity.csv"),
        "test": ("6_gecco2019_test_water_quality.csv",
                 "gecco2019_test_water_quality.csv"),
    }

    # Download data if necessary (all splits are fetched before any of them is parsed)
    local_paths = {}
    for split, (remote_name, local_name) in split_files.items():
        f_in = os.path.join(download_dir, local_name)
        download_if_necessary(f_in, base_url + remote_name + "?download=1", verbose)
        local_paths[split] = f_in

    # Load data -- the first CSV column is used as the row index
    frames = {split: pd.read_csv(f_in, index_col=0) for split, f_in in local_paths.items()}

    # Truthiness test instead of `is False`, so that falsy non-`bool` flags
    # (e.g. `numpy.False_` or 0) also select the data frame output
    if not return_X_y:
        return frames

    r = {}
    for split, df_data in frames.items():
        # Labels come from the "Event" column; "Time" is not a feature and is dropped from X
        y = df_data["Event"].to_numpy().astype(np.int8)
        X = df_data.drop(columns=["Event", "Time"]).to_numpy()
        r[split] = (X, y)

    return r