Source code for epyt_flow.data.benchmarks.gecco_water_quality

  1"""
  2Module provides functions for loading different GECCO water quality data sets.
  3
  4+------------------------------+---------------------------------------------------------------------------------------------+
  5| GECCO Water Quality 2017     | :func:`~epyt_flow.data.benchmarks.gecco_water_quality.load_gecco2017_water_quality_data`    |
  6+------------------------------+---------------------------------------------------------------------------------------------+
  7| GECCO Water Quality 2018     | :func:`~epyt_flow.data.benchmarks.gecco_water_quality.load_gecco2018_water_quality_data`    |
  8+------------------------------+---------------------------------------------------------------------------------------------+
  9| GECCO Water Quality 2019     | :func:`~epyt_flow.data.benchmarks.gecco_water_quality.load_gecco2019_water_quality_data`    |
 10+------------------------------+---------------------------------------------------------------------------------------------+
 11
 12Note that the scoring/evaluation algorithm is the same for all GECCO water quality benchmarks
 13and is implemented in
 14:func:`~epyt_flow.data.benchmarks.gecco_water_quality.compute_evaluation_score`.
 15"""
 16import os
 17from typing import Union
 18import numpy as np
 19import pandas as pd
 20from sklearn.metrics import f1_score
 21
 22from ...utils import get_temp_folder, download_if_necessary
 23
 24

[docs]
 25def compute_evaluation_score(y_pred: np.ndarray, y: np.ndarray) -> float:
 26    """
 27    Evaluates the performance of a detection method.
 28
 29    .. note::
 30        All GECCO water quality challenges use the F1-score for evaluation.
 31
 32    Parameters
 33    ----------
 34    y_pred : `numpy.ndarray <https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html>`_
 35        Event indication prediction over time
 36    y : `numpy.ndarray <https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html>`_
 37        Ground truth event indication over time.
 38
 39    Returns
 40    -------
 41    `float`
 42        Evaluation score.
 43    """
 44    return f1_score(y, y_pred)

 45
 46

[docs]
 47def load_gecco2017_water_quality_data(download_dir: str = None, return_X_y: bool = True,
 48                                      verbose: bool = True
 49                                      ) -> Union[pd.DataFrame, tuple[np.ndarray, np.ndarray]]:
 50    """
 51    GECCO Industrial Challenge 2017 Dataset: A water quality dataset for the
 52    "Monitoring of drinking-water quality" competition organized by M. Friese, J. Stork,
 53    A. Fischbach, M. Rebolledo, T. Bartz-Beielstein at the Genetic and Evolutionary
 54    Computation Conference 2017, Berlin, Germany
 55
 56    This is a benchmark for anomaly detection algorithms on water quality. The data is provided by
 57    the "Thüringer Fernwasserversorgung" (Germany) and constitutes a real-world data set. In this
 58    data set, 9 numeric water quality features are given at a sampling rate of 1 min over approx.
 59    3 month. The goal is to predict the presence of an anomaly -- i.e. binary classification.
 60
 61    More information can be found at https://zenodo.org/records/3884465 and
 62    http://www.spotseven.de/gecco-challenge/gecco-challenge-2017/
 63
 64    .. note::
 65
 66        Note that this is NOT a simulated scenario and therefore only the final
 67        data set is provided.
 68
 69    Parameters
 70    ----------
 71    download_dir : `str`, optional
 72        Path to the data files -- if None, the temp folder will be used.
 73        If the path does not exist, the data files will be downloaded to the given path.
 74
 75        The default is None.
 76    return_X_y : `bool`, optional
 77        If True, the data is returned together with the labels as two Numpy arrays,
 78        otherwise the data is returned as Pandas data frame.
 79
 80        The default is True.
 81    verbose : `bool`, optional
 82        If True, a progress bar is shown while downloading files.
 83
 84        The default is True.
 85
 86    Returns
 87    -------
 88    `pandas.DataFrame <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html>`_ or `tuple[numpy.ndarray, numpy.ndarray] <https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html>`_
 89        The benchmark data set as either a Pandas data frame or as a pair of (X, y) Numpy arrays.
 90    """
 91    url_data = "https://zenodo.org/records/3884465/files/1_gecco2017_water_quality.csv?download=1"
 92
 93    download_dir = download_dir if download_dir is not None else get_temp_folder()
 94    f_in = os.path.join(download_dir, "gecco2017_water_quality.csv")
 95
 96    download_if_necessary(f_in, url_data, verbose)
 97
 98    # Load and return data
 99    df_data = pd.read_csv(f_in, index_col=0)
100
101    if return_X_y is False:
102        return df_data
103    else:
104        y = df_data["EVENT"].to_numpy().astype(np.int8)
105        del df_data["EVENT"]
106
107        del df_data["Time"]
108        X = df_data.to_numpy()
109
110        return X, y

111
112

[docs]
def load_gecco2018_water_quality_data(download_dir: Union[str, None] = None,
                                      return_X_y: bool = True,
                                      verbose: bool = True
                                      ) -> Union[pd.DataFrame, tuple[np.ndarray, np.ndarray]]:
    """
    GECCO Industrial Challenge 2018 Dataset: A water quality dataset for the
    "Internet of Things: Online Anomaly Detection for Drinking Water Quality" competition
    organized by F. Rehbach, M. Rebolledo, S. Moritz, S. Chandrasekaran, T. Bartz-Beielstein at
    the Genetic and Evolutionary Computation Conference 2018, Kyoto, Japan.

    This is a benchmark
    (based on
    :func:`~epyt_flow.data.benchmarks.gecco_water_quality.load_gecco2017_water_quality_data`)
    for anomaly detection algorithms on water quality. The data is provided by the
    "Thüringer Fernwasserversorgung" (Germany) and constitutes a real-world data set. In this
    data set, 9 numeric water quality features are given at a sampling rate of 1 min over approx.
    3 months. The goal is to predict the presence of an anomaly -- i.e. binary classification.

    More information can be found at https://zenodo.org/records/3884398 and
    http://www.spotseven.de/gecco/gecco-challenge/gecco-challenge-2018/

    .. note::

        Note that this is NOT a simulated scenario and therefore only the final
        data set is provided.

    Parameters
    ----------
    download_dir : `str`, optional
        Path to the data files -- if None, the temp folder will be used.
        If the path does not exist, the data files will be downloaded to the given path.

        The default is None.
    return_X_y : `bool`, optional
        If True, the data is returned together with the labels as two Numpy arrays,
        otherwise the data is returned as Pandas data frame.
        The flag is evaluated by truthiness, so falsy values such as `numpy.bool_(False)`
        also select the data frame.

        The default is True.
    verbose : `bool`, optional
        If True, a progress bar is shown while downloading files.

        The default is True.

    Returns
    -------
    `pandas.DataFrame <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html>`_ or `tuple[numpy.ndarray, numpy.ndarray] <https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html>`_
        The benchmark data set as either a Pandas data frame or as a pair of (X, y) Numpy arrays.
        In the latter case, y is the "EVENT" column cast to `numpy.int8` and X contains all
        remaining columns except "Time".
    """
    # Download data if necessary
    url_data = "https://zenodo.org/records/3884398/files/1_gecco2018_water_quality.csv?download=1"

    if download_dir is None:
        download_dir = get_temp_folder()
    f_in = os.path.join(download_dir, "gecco2018_water_quality.csv")

    download_if_necessary(f_in, url_data, verbose)

    # Load data -- the first CSV column is used as the row index
    df_data = pd.read_csv(f_in, index_col=0)

    # Truthiness check instead of `is False`: the identity comparison would send
    # falsy non-bool values (e.g. numpy.bool_(False)) down the (X, y) branch.
    if not return_X_y:
        return df_data

    y = df_data["EVENT"].to_numpy().astype(np.int8)
    # Labels and time stamps are not features
    X = df_data.drop(columns=["EVENT", "Time"]).to_numpy()

    return X, y

181
182

[docs]
def load_gecco2019_water_quality_data(
        download_dir: Union[str, None] = None, return_X_y: bool = True, verbose: bool = True
        ) -> dict[str, Union[pd.DataFrame, tuple[np.ndarray, np.ndarray]]]:
    """
    GECCO Industrial Challenge 2019 Dataset: A water quality dataset for the "Internet of Things:
    Online Event Detection for Drinking Water Quality Control" competition organized by
    F. Rehbach, S. Moritz, T. Bartz-Beielstein at the Genetic and Evolutionary Computation
    Conference 2019, Prague, Czech Republic.

    This is a benchmark
    (based on
    :func:`~epyt_flow.data.benchmarks.gecco_water_quality.load_gecco2018_water_quality_data`)
    for anomaly detection algorithms on water quality. The data is provided by the
    "Thüringer Fernwasserversorgung" (Germany) and constitutes a real-world data set. In this
    data set, 6 numeric water quality features are given at a sampling rate of 1 min over approx.
    3 months. The goal is to predict the presence of an anomaly -- i.e. binary classification.
    The data set itself comes in three splits: A train set, a validation set, and a test set.

    More information can be found at https://zenodo.org/records/4304080 and
    https://www.th-koeln.de/informatik-und-ingenieurwissenschaften/gecco-challenge-2019_63244.php

    .. note::

        Note that this is NOT a simulated scenario and therefore only the final
        data set is provided.

    Parameters
    ----------
    download_dir : `str`, optional
        Path to the data files -- if None, the temp folder will be used.
        If the path does not exist, the data files will be downloaded to the given path.

        The default is None.
    return_X_y : `bool`, optional
        If True, each split is returned as a pair of (X, y) Numpy arrays,
        otherwise each split is returned as a Pandas data frame.
        The flag is evaluated by truthiness, so falsy values such as `numpy.bool_(False)`
        also select the data frames.

        The default is True.
    verbose : `bool`, optional
        If True, a progress bar is shown while downloading files.

        The default is True.

    Returns
    -------
    `dict`
        The data set as a dictionary with entries "train", "validation", and "test" containing
        the respective data -- either a Pandas data frame or a tuple (X, y) of Numpy arrays,
        where y is the "Event" column cast to `numpy.int8` and X contains all remaining
        columns except "Time".
    """
    # Download data if necessary
    if download_dir is None:
        download_dir = get_temp_folder()

    base_url = "https://zenodo.org/records/4304080/files/"

    # Split name -> (local file name, remote file name).
    # NOTE: "qulity" is kept as is -- changing the local name would invalidate
    # already downloaded files, and the URL must stay exactly as published.
    split_files = {
        "train": ("gecco2019_train_water_quality.csv",
                  "7_gecco2019_train_water_quality.csv?download=1"),
        "validation": ("gecco2019_valid_water_qulity.csv",
                       "8_gecco2019_valid_water_qulity.csv?download=1"),
        "test": ("gecco2019_test_water_quality.csv",
                 "6_gecco2019_test_water_quality.csv?download=1"),
    }

    # Fetch all splits before any of them is parsed
    local_paths = {}
    for split, (local_name, remote_name) in split_files.items():
        local_paths[split] = os.path.join(download_dir, local_name)
        download_if_necessary(local_paths[split], base_url + remote_name, verbose)

    # Load data -- the first CSV column is used as the row index
    data_frames = {split: pd.read_csv(f_in, index_col=0)
                   for split, f_in in local_paths.items()}

    # Truthiness check instead of `is False`: the identity comparison would send
    # falsy non-bool values (e.g. numpy.bool_(False)) down the (X, y) branch.
    if not return_X_y:
        return data_frames

    r = {}
    for split, df_data in data_frames.items():
        y = df_data["Event"].to_numpy().astype(np.int8)
        # Labels and time stamps are not features
        X = df_data.drop(columns=["Event", "Time"]).to_numpy()
        r[split] = (X, y)

    return r