Source code for epyt_flow.data.benchmarks.batadal

  1"""
  2The BATtle of the Attack Detection ALgorithms (*BATADAL*) by Riccardo Taormina, Stefano Galelli,
  3Nils Ole Tippenhauer, Avi Ostfeld, Elad Salomons, Demetrios Eliades is a competition on planning
  4and management of water networks undertaken within the Water Distribution Systems Analysis
  5Symposium. The goal of the battle was to compare the performance of algorithms for the detection
  6of cyber-physical attacks, whose frequency has increased in the last few years along with the
  7adoption of smart water technologies. The design challenge was set for the C-Town network,
  8a real-world, medium-sized water distribution system operated through programmable logic
  9controllers and a supervisory control and data acquisition (SCADA) system. Participants were
 10provided with data sets containing (simulated) SCADA observations, and challenged to design
 11an attack detection algorithm. The effectiveness of all submitted algorithms was evaluated in
 12terms of time-to-detection and classification accuracy. Seven teams participated in the battle
 13and proposed a variety of successful approaches leveraging data analysis, model-based detection
 14mechanisms, and rule checking. Results were presented at the Water Distribution Systems Analysis
 15Symposium (World Environmental and Water Resources Congress) in Sacramento, California on
 16May 21-25, 2017.
 17The `paper <https://doi.org/10.1061/(ASCE)WR.1943-5452.0000969>`_ summarizes the BATADAL
 18problem, proposed algorithms, results, and future research directions.
 19
 20See https://www.batadal.net/ for details.
 21
 22This module provides functions for loading the original BATADAL data set
 23:func:`~epyt_flow.data.benchmarks.batadal.load_data`, as well as functions for loading the
 24scenarios :func:`~epyt_flow.data.benchmarks.batadal.load_scenario` and pre-generated
 25SCADA data :func:`~epyt_flow.data.benchmarks.batadal.load_scada_data`.
 26"""
 27import os
 28from typing import Any
 29from datetime import datetime
 30import pandas as pd
 31import numpy as np
 32
 33from .batadal_data import TRAINING_DATA_2_ATTACKS_TIME, TRAINING_DATA_2_START_TIME, \
 34    TEST_DATA_ATTACKS_TIME, TEST_DATA_START_TIME
 35from ...utils import get_temp_folder, unpack_zip_archive, to_seconds, download_if_necessary
 36from ...simulation import ScenarioConfig
 37
 38
 39def __parse_attacks_time(start_time: str, attacks_time):
 40    events = []
 41    for event in attacks_time.splitlines():
 42        # Parse entry
 43        items = [i.strip() for i in event.split(",")]
 44
 45        event_start_time = int((datetime.strptime(items[0], "%d/%m/%Y %H:%M") - start_time)
 46                               .total_seconds())
 47        event_end_time = int((datetime.strptime(items[1], "%d/%m/%Y %H:%M") - start_time)
 48                             .total_seconds())
 49
 50        events.append((event_start_time, event_end_time))
 51
 52    return events
 53
 54
def __batadal_events_to_labels(events: list, n_samples: int, time_step: int) -> np.ndarray:
    """
    Builds a 0/1 label vector in which all samples inside the given time windows are set to 1.

    Parameters
    ----------
    events : `list`
        List of (start, end) tuples in seconds.
    n_samples : `int`
        Length of the label vector.
    time_step : `int`
        Time (in seconds) between two consecutive samples.

    Returns
    -------
    `numpy.ndarray`
        Label vector of length `n_samples` -- the sample index of an event's end
        is not included in the event.
    """
    labels = np.zeros(n_samples)
    for event_start, event_end in events:
        t0 = int(event_start / time_step)
        t1 = int(event_end / time_step)
        labels[t0:t1] = 1

    return labels


def __batadal_describe_feature(name: str) -> str:
    """
    Expands the abbreviations in a BATADAL column name (e.g. "F_PU1" becomes "Flow_Pump1").

    Parameters
    ----------
    name : `str`
        Column name.

    Returns
    -------
    `str`
        Column name in which all known abbreviations are replaced.
    """
    # The order of the entries matters: abbreviations are replaced one after another
    desc_mapping = {"PU": "Pump", "V": "Valve", "T": "Tank", "L": "Level", "S": "State",
                    "P": "Pressure", "F": "Flow"}

    pump = False
    for abbreviation, meaning in desc_mapping.items():
        if abbreviation not in name:
            continue
        if abbreviation == "P" and pump:
            # The "P" stems from the already inserted "Pump" -- it is not a pressure
            continue

        name = name.replace(abbreviation, meaning)
        if abbreviation == "PU":
            pump = True

    return name


def load_data(download_dir: str = None, return_X_y: bool = False,
              return_ground_truth: bool = False, return_features_desc: bool = False,
              verbose: bool = True) -> dict:
    """
    Loads the original BATADAL competition data.

    Parameters
    ----------
    download_dir : `str`, optional
        Path to the data files -- if None, the temp folder will be used.
        If the path does not exist, the data files will be downloaded to the given path.

        The default is None.
    return_X_y : `bool`, optional
        If True, the data together with the labels is returned as pairs of Numpy arrays.
        Otherwise, the data is returned as Pandas data frames.

        The default is False.
    return_ground_truth : `bool`
        If True and if `return_X_y` is True, the ground truth labels are included in the
        returned dictionary -- note that the labels provided in the benchmark constitute
        a partial labeling only.

        The default is False.
    return_features_desc : `bool`
        If True and if `return_X_y` is True, feature names (i.e. descriptions) are included
        in the returned dictionary.

        The default is False.
    verbose : `bool`, optional
        If True, a progress bar is shown while downloading files.

        The default is True.

    Returns
    -------
    `dict`
        Dictionary of the loaded benchmark data. The dictionary contains the two training
        data sets ("train_1" and "train_2"), as well as the test data set ("test").
        If `return_X_y` is False, each dictionary entry is a `Pandas dataframe <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html>`_.
        Otherwise, the training sets are tuples of sensor readings and labels, and the
        test set is the array of sensor readings only.
        If `return_ground_truth` is True, the ground truth labels are appended to each of
        the tuples -- the test set becomes a tuple of sensor readings and ground truth labels.
        If `return_features_desc` is True, the list of feature descriptions is stored
        in the additional entry "features_desc".
    """
    download_dir = download_dir if download_dir is not None else get_temp_folder()

    # Download data
    training_data_1_url = "https://www.batadal.net/data/BATADAL_dataset03.csv"
    training_data_2_url = "https://www.batadal.net/data/BATADAL_dataset04.csv"
    test_data_url = "https://www.batadal.net/data/BATADAL_test_dataset.zip"

    training_data_1_path = os.path.join(download_dir, "BATADAL_dataset03.csv")
    training_data_2_path = os.path.join(download_dir, "BATADAL_dataset04.csv")
    test_data_archive_path = os.path.join(download_dir, "BATADAL_test_dataset.zip")
    test_data_path = os.path.join(download_dir, "BATADAL_test_dataset.csv")

    download_if_necessary(training_data_1_path, training_data_1_url, verbose)
    download_if_necessary(training_data_2_path, training_data_2_url, verbose)

    download_if_necessary(test_data_archive_path, test_data_url, verbose)
    unpack_zip_archive(test_data_archive_path, download_dir)

    # Load data
    df_train_1 = pd.read_csv(training_data_1_path)
    df_train_2 = pd.read_csv(training_data_2_path)
    df_test = pd.read_csv(test_data_path)

    # Flags are evaluated by their truth value so that e.g. numpy.bool_ works as well
    if not return_X_y:
        return {"train_1": df_train_1, "train_2": df_train_2, "test": df_test}

    # Convert data to numpy
    y_train_1 = df_train_1["ATT_FLAG"].to_numpy().astype(np.int8)
    del df_train_1["ATT_FLAG"]
    del df_train_1["DATETIME"]
    X_train_1 = df_train_1.to_numpy()

    # Note that the label column of the second training set is named " ATT_FLAG"
    # (leading whitespace). Entries of -999 are mapped to 0
    # (presumably the marker of unlabeled samples -- TODO confirm).
    y_train_2 = df_train_2[" ATT_FLAG"].to_numpy(copy=True)
    y_train_2[y_train_2 == -999] = 0
    y_train_2 = y_train_2.astype(np.int8)
    del df_train_2[" ATT_FLAG"]
    del df_train_2["DATETIME"]
    X_train_2 = df_train_2.to_numpy()

    del df_test["DATETIME"]
    X_test = df_test.to_numpy()

    # Create final results
    r = {"train_1": (X_train_1, y_train_1), "train_2": (X_train_2, y_train_2),
         "test": X_test}

    if return_ground_truth:
        # Create ground truth labels -- only computed if requested
        hydraulic_time_step = to_seconds(minutes=15)
        training_data_2_events_time = __parse_attacks_time(TRAINING_DATA_2_START_TIME,
                                                           TRAINING_DATA_2_ATTACKS_TIME)
        test_data_events_time = __parse_attacks_time(TEST_DATA_START_TIME,
                                                     TEST_DATA_ATTACKS_TIME)

        y_train_2_truth = __batadal_events_to_labels(training_data_2_events_time,
                                                     X_train_2.shape[0], hydraulic_time_step)
        y_test_truth = __batadal_events_to_labels(test_data_events_time,
                                                  X_test.shape[0], hydraulic_time_step)

        # The labels of the first training set are used as its ground truth
        r["train_1"] = (X_train_1, y_train_1, y_train_1)
        r["train_2"] = (X_train_2, y_train_2, y_train_2_truth)
        r["test"] = (X_test, y_test_truth)

    if return_features_desc:
        # Create features' descriptions from the remaining columns of the first training set
        r["features_desc"] = [__batadal_describe_feature(f_desc)
                              for f_desc in df_train_1.columns]

    return r
187 188
def load_scada_data(
        download_dir: str = None,
        return_X_y: bool = False,
        return_ground_truth: bool = False,
        return_features_desc: bool = False,
        verbose: bool = True) -> Any:
    """
    Loads the pre-generated SCADA data of the simulated BATADAL benchmark scenario.

    .. note::

        Due to randomness and undocumented aspects of the original BATADAL data set,
        these data differ from the original data set which can be loaded by calling
        :func:`~epyt_flow.data.benchmarks.batadal.load_data`.

    Parameters
    ----------
    download_dir : `str`, optional
        Path to the data files -- if None, the temp folder will be used.
        If the path does not exist, the data files will be downloaded to the given path.

        The default is None.
    return_X_y : `bool`, optional
        If True, the data together with the labels is returned as pairs of Numpy arrays.
        Otherwise, the data is returned as Pandas data frames.

        The default is False.
    return_ground_truth : `bool`
        If True and if `return_X_y` is True, the ground truth labels are included in the
        returned dictionary -- note that the labels provided in the benchmark constitute
        a partial labeling only.

        The default is False.
    return_features_desc : `bool`
        If True and if `return_X_y` is True, feature names (i.e. descriptions) are included
        in the returned dictionary.

        The default is False.
    verbose : `bool`, optional
        If True, a progress bar is shown while downloading files.

        The default is True.

    Raises
    ------
    NotImplementedError
        Always -- this function is not implemented yet.
    """
    raise NotImplementedError()
227 228
def load_scenario(
        download_dir: str = None,
        verbose: bool = True) -> ScenarioConfig:
    """
    Creates and returns the BATADAL scenario, which can either be modified or be passed
    directly to the simulator
    :class:`~epyt_flow.simulation.scenario_simulator.ScenarioSimulator`.

    .. note::

        Due to randomness and undocumented aspects of the original BATADAL benchmark,
        the scenario simulation results differ from the original data set which can be
        loaded by calling :func:`~epyt_flow.data.benchmarks.batadal.load_data`.

    Parameters
    ----------
    download_dir : `str`, optional
        Path to the data files -- if None, the temp folder will be used.
        If the path does not exist, the data files will be downloaded to the given path.

        The default is None.
    verbose : `bool`, optional
        If True, a progress bar is shown while downloading files.

        The default is True.

    Returns
    -------
    :class:`~epyt_flow.simulation.scenario_config.ScenarioConfig`
        The BATADAL scenario.

    Raises
    ------
    NotImplementedError
        Always -- this function is not implemented yet.
    """
    raise NotImplementedError()