1"""
The BATtle of the Attack Detection ALgorithms (*BATADAL*) by Riccardo Taormina, Stefano Galelli,
Nils Ole Tippenhauer, Avi Ostfeld, Elad Salomons, Demetrios Eliades is a competition on planning
and management of water networks undertaken within the Water Distribution Systems Analysis
Symposium. The goal of the battle was to compare the performance of algorithms for the detection
of cyber-physical attacks, whose frequency has increased in the last few years along with the
adoption of smart water technologies. The design challenge was set for the C-Town network,
a real-world, medium-sized water distribution system operated through programmable logic
controllers and a supervisory control and data acquisition (SCADA) system. Participants were
provided with data sets containing (simulated) SCADA observations, and challenged to design
an attack detection algorithm. The effectiveness of all submitted algorithms was evaluated in
terms of time-to-detection and classification accuracy. Seven teams participated in the battle
and proposed a variety of successful approaches leveraging data analysis, model-based detection
mechanisms, and rule checking. Results were presented at the Water Distribution Systems Analysis
Symposium (World Environmental and Water Resources Congress) in Sacramento, California on
May 21-25, 2017.
The `paper <https://doi.org/10.1061/(ASCE)WR.1943-5452.0000969>`_ summarizes the BATADAL
problem, proposed algorithms, results, and future research directions.

See https://www.batadal.net/ for details.

This module provides functions for loading the original BATADAL data set
:func:`~epyt_flow.data.benchmarks.batadal.load_data`, as well as functions for loading the
scenarios :func:`~epyt_flow.data.benchmarks.batadal.load_scenario` and pre-generated
SCADA data :func:`~epyt_flow.data.benchmarks.batadal.load_scada_data`.
26"""
27import os
28from typing import Any
29from datetime import datetime
30import pandas as pd
31import numpy as np
32
33from .batadal_data import TRAINING_DATA_2_ATTACKS_TIME, TRAINING_DATA_2_START_TIME, \
34 TEST_DATA_ATTACKS_TIME, TEST_DATA_START_TIME
35from ...utils import get_temp_folder, unpack_zip_archive, to_seconds, download_if_necessary
36from ...simulation import ScenarioConfig
37
38
def __parse_attacks_time(start_time: datetime, attacks_time: str) -> list:
    """
    Converts a textual list of attack intervals into offsets (in seconds) relative to
    a reference point in time.

    Parameters
    ----------
    start_time : `datetime.datetime`
        Reference point in time (i.e. time stamp of the first sample).
        For convenience, a string in the format "%d/%m/%Y %H:%M" is accepted as well.
    attacks_time : `str`
        One attack per line, each line being "<start>, <end>" where both time stamps
        are given in the format "%d/%m/%Y %H:%M". Blank lines are ignored.

    Returns
    -------
    `list`
        List of tuples (start, end), one per attack, where start and end are the number of
        seconds (as `int`) between `start_time` and the begin/end of the attack.
    """
    time_format = "%d/%m/%Y %H:%M"

    # The subtraction below requires a datetime instance -- strings are parsed first so that
    # callers relying on the former `str` annotation do not run into a TypeError.
    if isinstance(start_time, str):
        start_time = datetime.strptime(start_time.strip(), time_format)

    events = []
    for event in attacks_time.splitlines():
        # Skip blank lines (e.g. caused by a leading or trailing line break)
        if not event.strip():
            continue

        # Parse entry
        items = [i.strip() for i in event.split(",")]

        event_start_time = int((datetime.strptime(items[0], time_format) - start_time)
                               .total_seconds())
        event_end_time = int((datetime.strptime(items[1], time_format) - start_time)
                             .total_seconds())

        events.append((event_start_time, event_end_time))

    return events
53
54
def _events_to_labels(n_samples: int, events_time: list, time_step: int) -> np.ndarray:
    """
    Builds a binary label vector from a list of attack intervals.

    Parameters
    ----------
    n_samples : `int`
        Length of the label vector.
    events_time : `list`
        List of tuples (start, end) given in seconds relative to the first sample.
    time_step : `int`
        Time (in seconds) between two consecutive samples.

    Returns
    -------
    `numpy.ndarray`
        Array of zeros in which all samples in [start, end) of every interval are set to one.
    """
    labels = np.zeros(n_samples)
    for event_start, event_end in events_time:
        t0 = int(event_start / time_step)
        t1 = int(event_end / time_step)
        labels[t0:t1] = 1

    return labels


def _describe_features(feature_names: list) -> list:
    """
    Expands the abbreviated BATADAL column names into readable descriptions by replacing
    the abbreviations (e.g. "PU", "T", "L") with "Pump", "Tank", "Level", etc.

    Parameters
    ----------
    feature_names : `list`
        Column names.

    Returns
    -------
    `list`
        Expanded names, in the same order as `feature_names`.
    """
    # NOTE: The order of the entries matters -- the replacements are applied one after another
    desc_mapping = {"PU": "Pump", "V": "Valve", "T": "Tank", "L": "Level", "S": "State",
                    "P": "Pressure", "F": "Flow"}

    descriptions = []
    for f_desc in feature_names:
        pump = False
        for k, value in desc_mapping.items():
            if k not in f_desc:
                continue
            # "Pump" itself contains a "P" which must not be expanded to "Pressure"
            if k == "P" and pump is True:
                continue

            f_desc = f_desc.replace(k, value)
            if k == "PU":
                pump = True

        descriptions.append(f_desc)

    return descriptions


def load_data(download_dir: str = None, return_X_y: bool = False,
              return_ground_truth: bool = False, return_features_desc: bool = False,
              verbose: bool = True) -> dict:
    """
    Loads the original BATADAL competition data.

    Parameters
    ----------
    download_dir : `str`, optional
        Path to the data files -- if None, the temp folder will be used.
        If the path does not exist, the data files will be downloaded to the given path.

        The default is None.
    return_X_y : `bool`, optional
        If True, the data together with the labels is returned as pairs of Numpy arrays.
        Otherwise, the data is returned as Pandas data frames.

        The default is False.
    return_ground_truth : `bool`, optional
        If True and if `return_X_y` is True, the ground truth labels are appended to the
        returned tuples -- note that the labels provided in the benchmark constitute
        a partial labeling only.

        The default is False.
    return_features_desc : `bool`, optional
        If True and if `return_X_y` is True, feature names (i.e. descriptions) are included
        in the returned dictionary (key "features_desc").

        The default is False.
    verbose : `bool`, optional
        If True, a progress bar is shown while downloading files.

        The default is True.

    Returns
    -------
    `dict`
        Dictionary of the loaded benchmark data. The dictionary contains the two training
        data sets ("train_1" and "train_2"), as well as the test data set ("test").
        If `return_X_y` is False, each dictionary entry is a `Pandas dataframe <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html>`_.
        Otherwise, the training sets are tuples of sensor readings and labels, and the test
        set is the array of sensor readings. If `return_ground_truth` is True, the
        ground truth labels are appended to the training tuples and the test set becomes a
        tuple of sensor readings and ground truth labels. If `return_features_desc` is True,
        the list of feature descriptions is stored under the additional key "features_desc".
    """
    download_dir = download_dir if download_dir is not None else get_temp_folder()

    # Download data
    training_data_1_url = "https://www.batadal.net/data/BATADAL_dataset03.csv"
    training_data_2_url = "https://www.batadal.net/data/BATADAL_dataset04.csv"
    test_data_url = "https://www.batadal.net/data/BATADAL_test_dataset.zip"

    training_data_1_path = os.path.join(download_dir, "BATADAL_dataset03.csv")
    training_data_2_path = os.path.join(download_dir, "BATADAL_dataset04.csv")
    test_data_zip_path = os.path.join(download_dir, "BATADAL_test_dataset.zip")

    download_if_necessary(training_data_1_path, training_data_1_url, verbose)
    download_if_necessary(training_data_2_path, training_data_2_url, verbose)

    download_if_necessary(test_data_zip_path, test_data_url, verbose)
    unpack_zip_archive(test_data_zip_path, download_dir)

    # Load data
    df_train_1 = pd.read_csv(training_data_1_path)
    df_train_2 = pd.read_csv(training_data_2_path)
    df_test = pd.read_csv(os.path.join(download_dir, "BATADAL_test_dataset.csv"))

    # Strict identity check: only the value True switches to the Numpy representation
    if return_X_y is not True:
        return {"train_1": df_train_1, "train_2": df_train_2, "test": df_test}

    # Convert data to numpy
    y_train_1 = df_train_1["ATT_FLAG"].to_numpy().astype(np.int8)
    del df_train_1["ATT_FLAG"]
    del df_train_1["DATETIME"]
    X_train_1 = df_train_1.to_numpy()

    # Note the leading white space in the column name of the second training set
    y_train_2 = df_train_2[" ATT_FLAG"].to_numpy(copy=True)
    # Entries of -999 are mapped to 0 -- presumably these mark unlabeled samples (TODO confirm)
    y_train_2[y_train_2 == -999] = 0
    y_train_2 = y_train_2.astype(np.int8)
    del df_train_2[" ATT_FLAG"]
    del df_train_2["DATETIME"]
    X_train_2 = df_train_2.to_numpy()

    del df_test["DATETIME"]
    X_test = df_test.to_numpy()

    # Create ground truth labels
    hydraulic_time_step = to_seconds(minutes=15)
    training_data_2_events_time = __parse_attacks_time(TRAINING_DATA_2_START_TIME,
                                                       TRAINING_DATA_2_ATTACKS_TIME)
    test_data_events_time = __parse_attacks_time(TEST_DATA_START_TIME, TEST_DATA_ATTACKS_TIME)

    y_train_2_truth = _events_to_labels(X_train_2.shape[0], training_data_2_events_time,
                                        hydraulic_time_step)
    y_test_truth = _events_to_labels(X_test.shape[0], test_data_events_time,
                                     hydraulic_time_step)

    # Create features' descriptions
    features_desc = _describe_features(list(df_train_1.columns))

    # Create final results
    r = {"train_1": (X_train_1, y_train_1), "train_2": (X_train_2, y_train_2),
         "test": X_test}

    if return_ground_truth is True:
        # For the first training set, the provided labels are reused as the ground truth
        r["train_1"] = (X_train_1, y_train_1, y_train_1)
        r["train_2"] = (X_train_2, y_train_2, y_train_2_truth)
        # The test entry is a plain array (not a tuple) -- indexing it with [0] would
        # return the first sample only instead of all sensor readings.
        r["test"] = (X_test, y_test_truth)

    if return_features_desc is True:
        r["features_desc"] = features_desc

    return r
187
188
def load_scada_data(download_dir: str = None, return_X_y: bool = False,
                    return_ground_truth: bool = False, return_features_desc: bool = False,
                    verbose: bool = True) -> Any:
    """
    Loads the SCADA data of the simulated BATADAL benchmark scenario -- note that due to
    randomness and undocumented aspects of the original BATADAL data set, these differ from
    the original data set which can be loaded by calling
    :func:`~epyt_flow.data.benchmarks.batadal.load_data`.

    .. note::

        This function is not implemented yet -- calling it always raises
        a `NotImplementedError`.

    Parameters
    ----------
    download_dir : `str`, optional
        Path to the data files -- if None, the temp folder will be used.
        If the path does not exist, the data files will be downloaded to the given path.

        The default is None.
    return_X_y : `bool`, optional
        If True, the data together with the labels is returned as pairs of Numpy arrays.
        Otherwise, the data is returned as Pandas data frames.

        The default is False.
    return_ground_truth : `bool`, optional
        If True and if `return_X_y` is True, the ground truth labels are included in the
        returned dictionary -- note that the labels provided in the benchmark constitute
        a partial labeling only.

        The default is False.
    return_features_desc : `bool`, optional
        If True and if `return_X_y` is True, feature names (i.e. descriptions) are included
        in the returned dictionary.

        The default is False.
    verbose : `bool`, optional
        If True, a progress bar is shown while downloading files.

        The default is True.

    Raises
    ------
    NotImplementedError
        Always, because loading the pre-generated SCADA data is not implemented yet.
    """
    raise NotImplementedError("Loading the pre-generated BATADAL SCADA data is not "
                              "implemented yet -- use load_data() for loading the "
                              "original BATADAL data set")
227
228
def load_scenario(download_dir: str = None, verbose: bool = True) -> ScenarioConfig:
    """
    Creates and returns the BATADAL scenario -- it can be either modified or directly passed
    to the simulator :class:`~epyt_flow.simulation.scenario_simulator.ScenarioSimulator`.

    .. note::

        Note that due to randomness and undocumented aspects of the original BATADAL benchmark,
        the scenario simulation results differ from the original data set which can be loaded by
        calling :func:`~epyt_flow.data.benchmarks.batadal.load_data`.

    .. note::

        This function is not implemented yet -- calling it always raises
        a `NotImplementedError`.

    Parameters
    ----------
    download_dir : `str`, optional
        Path to the data files -- if None, the temp folder will be used.
        If the path does not exist, the data files will be downloaded to the given path.

        The default is None.
    verbose : `bool`, optional
        If True, a progress bar is shown while downloading files.

        The default is True.

    Returns
    -------
    :class:`~epyt_flow.simulation.scenario_config.ScenarioConfig`
        The BATADAL scenario.

    Raises
    ------
    NotImplementedError
        Always, because creating the BATADAL scenario is not implemented yet.
    """
    raise NotImplementedError("Creating the BATADAL scenario is not implemented yet -- "
                              "use load_data() for loading the original BATADAL data set")