1"""
This module provides functions for loading different GECCO water quality data sets.

+------------------------------+------------------------------------------------------------------------------------------+
| GECCO Water Quality 2017     | :func:`~epyt_flow.data.benchmarks.gecco_water_quality.load_gecco2017_water_quality_data` |
+------------------------------+------------------------------------------------------------------------------------------+
| GECCO Water Quality 2018     | :func:`~epyt_flow.data.benchmarks.gecco_water_quality.load_gecco2018_water_quality_data` |
+------------------------------+------------------------------------------------------------------------------------------+
| GECCO Water Quality 2019     | :func:`~epyt_flow.data.benchmarks.gecco_water_quality.load_gecco2019_water_quality_data` |
+------------------------------+------------------------------------------------------------------------------------------+

Note that the scoring/evaluation algorithm is the same for all GECCO water quality benchmarks
and is implemented in
:func:`~epyt_flow.data.benchmarks.gecco_water_quality.compute_evaluation_score`.
15"""
16import os
17from typing import Union
18import numpy as np
19import pandas as pd
20from sklearn.metrics import f1_score
21
22from ...utils import get_temp_folder, download_if_necessary
23
24
[docs]
def compute_evaluation_score(y_pred: np.ndarray, y: np.ndarray) -> float:
    """
    Scores the predictions of an event detection method against the ground truth.

    .. note::
        The F1-score is the evaluation metric of all GECCO water quality challenges.

    Parameters
    ----------
    y_pred : `numpy.ndarray <https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html>`_
        Predicted event indication over time.
    y : `numpy.ndarray <https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html>`_
        Ground truth event indication over time.

    Returns
    -------
    `float`
        F1-score of the predictions.
    """
    # Keyword arguments make the (ground truth, prediction) order explicit --
    # note that it is the reverse of this function's own parameter order.
    score = f1_score(y_true=y, y_pred=y_pred)
    return score
45
46
[docs]
def load_gecco2017_water_quality_data(download_dir: Union[str, None] = None,
                                      return_X_y: bool = True,
                                      verbose: bool = True
                                      ) -> Union[pd.DataFrame, tuple[np.ndarray, np.ndarray]]:
    """
    GECCO Industrial Challenge 2017 Dataset: A water quality dataset for the
    "Monitoring of drinking-water quality" competition organized by M. Friese, J. Stork,
    A. Fischbach, M. Rebolledo, T. Bartz-Beielstein at the Genetic and Evolutionary
    Computation Conference 2017, Berlin, Germany.

    This is a benchmark for anomaly detection algorithms on water quality. The data is provided by
    the "Thüringer Fernwasserversorgung" (Germany) and constitutes a real-world data set. In this
    data set, 9 numeric water quality features are given at a sampling rate of 1 min over approx.
    3 months. The goal is to predict the presence of an anomaly -- i.e. binary classification.

    More information can be found at https://zenodo.org/records/3884465 and
    http://www.spotseven.de/gecco-challenge/gecco-challenge-2017/

    .. note::

        Note that this is NOT a simulated scenario and therefore only the final
        data set is provided.

    Parameters
    ----------
    download_dir : `str`, optional
        Path to the data files -- if None, the temp folder will be used.
        If the path does not exist, the data files will be downloaded to the given path.

        The default is None.
    return_X_y : `bool`, optional
        If True, the data is returned together with the labels as two Numpy arrays,
        otherwise the data is returned as Pandas data frame.

        The default is True.
    verbose : `bool`, optional
        If True, a progress bar is shown while downloading files.

        The default is True.

    Returns
    -------
    `pandas.DataFrame <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html>`_ or `tuple[numpy.ndarray, numpy.ndarray] <https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html>`_
        The benchmark data set as either a Pandas data frame or as a pair of (X, y) Numpy arrays.
    """
    # Download data if necessary
    url_data = "https://zenodo.org/records/3884465/files/1_gecco2017_water_quality.csv?download=1"

    download_dir = download_dir if download_dir is not None else get_temp_folder()
    f_in = os.path.join(download_dir, "gecco2017_water_quality.csv")

    download_if_necessary(f_in, url_data, verbose)

    # Load and return data -- the first CSV column is used as the row index
    df_data = pd.read_csv(f_in, index_col=0)

    # Truthiness test instead of `is False`: an identity check would treat falsy
    # non-bool flags such as `0` or `numpy.False_` as if True had been passed.
    if not return_X_y:
        return df_data

    # Labels: the "EVENT" column is removed from the frame and cast to int8.
    y = df_data.pop("EVENT").to_numpy().astype(np.int8)

    # Features: all remaining columns except the "Time" column.
    X = df_data.drop(columns="Time").to_numpy()

    return X, y
111
112
[docs]
def load_gecco2018_water_quality_data(download_dir: Union[str, None] = None,
                                      return_X_y: bool = True,
                                      verbose: bool = True
                                      ) -> Union[pd.DataFrame, tuple[np.ndarray, np.ndarray]]:
    """
    GECCO Industrial Challenge 2018 Dataset: A water quality dataset for the
    "Internet of Things: Online Anomaly Detection for Drinking Water Quality" competition
    organized by F. Rehbach, M. Rebolledo, S. Moritz, S. Chandrasekaran, T. Bartz-Beielstein at
    the Genetic and Evolutionary Computation Conference 2018, Kyoto, Japan.

    This is a benchmark
    (based on
    :func:`~epyt_flow.data.benchmarks.gecco_water_quality.load_gecco2017_water_quality_data`)
    for anomaly detection algorithms on water quality. The data is provided by the
    "Thüringer Fernwasserversorgung" (Germany) and constitutes a real-world data set. In this
    data set, 9 numeric water quality features are given at a sampling rate of 1 min over approx.
    3 months. The goal is to predict the presence of an anomaly -- i.e. binary classification.

    More information can be found at https://zenodo.org/records/3884398 and
    http://www.spotseven.de/gecco/gecco-challenge/gecco-challenge-2018/

    .. note::

        Note that this is NOT a simulated scenario and therefore only the final
        data set is provided.

    Parameters
    ----------
    download_dir : `str`, optional
        Path to the data files -- if None, the temp folder will be used.
        If the path does not exist, the data files will be downloaded to the given path.

        The default is None.
    return_X_y : `bool`, optional
        If True, the data is returned together with the labels as two Numpy arrays,
        otherwise the data is returned as Pandas data frame.

        The default is True.
    verbose : `bool`, optional
        If True, a progress bar is shown while downloading files.

        The default is True.

    Returns
    -------
    `pandas.DataFrame <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html>`_ or `tuple[numpy.ndarray, numpy.ndarray] <https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html>`_
        The benchmark data set as either a Pandas data frame or as a pair of (X, y) Numpy arrays.
    """
    # Download data if necessary
    url_data = "https://zenodo.org/records/3884398/files/1_gecco2018_water_quality.csv?download=1"

    download_dir = download_dir if download_dir is not None else get_temp_folder()
    f_in = os.path.join(download_dir, "gecco2018_water_quality.csv")

    download_if_necessary(f_in, url_data, verbose)

    # Load and return data -- the first CSV column is used as the row index
    df_data = pd.read_csv(f_in, index_col=0)

    # Truthiness test instead of `is False`: an identity check would treat falsy
    # non-bool flags such as `0` or `numpy.False_` as if True had been passed.
    if not return_X_y:
        return df_data

    # Labels: the "EVENT" column is removed from the frame and cast to int8.
    y = df_data.pop("EVENT").to_numpy().astype(np.int8)

    # Features: all remaining columns except the "Time" column.
    X = df_data.drop(columns="Time").to_numpy()

    return X, y
181
182
[docs]
def load_gecco2019_water_quality_data(download_dir: Union[str, None] = None,
                                      return_X_y: bool = True,
                                      verbose: bool = True) -> dict:
    """
    GECCO Industrial Challenge 2019 Dataset: A water quality dataset for the "Internet of Things:
    Online Event Detection for Drinking Water Quality Control" competition organized by
    F. Rehbach, S. Moritz, T. Bartz-Beielstein at the Genetic and Evolutionary Computation
    Conference 2019, Prague, Czech Republic.

    This is a benchmark
    (based on
    :func:`~epyt_flow.data.benchmarks.gecco_water_quality.load_gecco2018_water_quality_data`)
    for anomaly detection algorithms on water quality. The data is provided by the
    "Thüringer Fernwasserversorgung" (Germany) and constitutes a real-world data set. In this
    data set, 6 numeric water quality features are given at a sampling rate of 1 min over approx.
    3 months. The goal is to predict the presence of an anomaly -- i.e. binary classification.
    The data set itself comes in three splits: A train set, a validation set, and a test set.

    More information can be found at https://zenodo.org/records/4304080 and
    https://www.th-koeln.de/informatik-und-ingenieurwissenschaften/gecco-challenge-2019_63244.php

    .. note::

        Note that this is NOT a simulated scenario and therefore only the final
        data set is provided.

    Parameters
    ----------
    download_dir : `str`, optional
        Path to the data files -- if None, the temp folder will be used.
        If the path does not exist, the data files will be downloaded to the given path.

        The default is None.
    return_X_y : `bool`, optional
        If True, each split is returned together with its labels as two Numpy arrays,
        otherwise each split is returned as Pandas data frame.

        The default is True.
    verbose : `bool`, optional
        If True, a progress bar is shown while downloading files.

        The default is True.

    Returns
    -------
    `dict`
        The data set as a dictionary with entries "train", "validation", and "test" containing
        the respective data -- each entry is either a Pandas data frame or
        a pair of (X, y) Numpy arrays, depending on `return_X_y`.
    """
    # Download data if necessary
    download_dir = download_dir if download_dir is not None else get_temp_folder()

    # NOTE(review): the "qulity" spelling in the validation file names is kept as-is;
    # presumably it mirrors the name of the remote file -- confirm before changing it.
    base_url = "https://zenodo.org/records/4304080/files/"
    url_train_data = base_url + "7_gecco2019_train_water_quality.csv?download=1"
    url_valid_data = base_url + "8_gecco2019_valid_water_qulity.csv?download=1"
    url_test_data = base_url + "6_gecco2019_test_water_quality.csv?download=1"

    f_train_in = os.path.join(download_dir, "gecco2019_train_water_quality.csv")
    f_valid_in = os.path.join(download_dir, "gecco2019_valid_water_qulity.csv")
    f_test_in = os.path.join(download_dir, "gecco2019_test_water_quality.csv")

    download_if_necessary(f_train_in, url_train_data, verbose)
    download_if_necessary(f_valid_in, url_valid_data, verbose)
    download_if_necessary(f_test_in, url_test_data, verbose)

    # Load and return data -- the first CSV column is used as the row index
    splits = {"train": pd.read_csv(f_train_in, index_col=0),
              "validation": pd.read_csv(f_valid_in, index_col=0),
              "test": pd.read_csv(f_test_in, index_col=0)}

    # Truthiness test instead of `is False`: an identity check would treat falsy
    # non-bool flags such as `0` or `numpy.False_` as if True had been passed.
    if not return_X_y:
        return splits

    r = {}
    for k, df_data in splits.items():
        # The label column is spelled "Event" in these files.
        y = df_data.pop("Event").to_numpy().astype(np.int8)

        # Features: all remaining columns except the "Time" column.
        X = df_data.drop(columns="Time").to_numpy()

        r[k] = (X, y)

    return r
264
265 r[k] = (X, y)
266
267 return r