Source code for epyt_flow.data.benchmarks.batadal

  1"""
The BATtle of the Attack Detection ALgorithms (*BATADAL*) by Riccardo Taormina, Stefano Galelli,
Nils Ole Tippenhauer, Avi Ostfeld, Elad Salomons, Demetrios Eliades is a competition on planning
and management of water networks undertaken within the Water Distribution Systems Analysis
Symposium. The goal of the battle was to compare the performance of algorithms for the detection
of cyber-physical attacks, whose frequency has increased in the last few years along with the
adoption of smart water technologies. The design challenge was set for the C-Town network,
a real-world, medium-sized water distribution system operated through programmable logic
controllers and a supervisory control and data acquisition (SCADA) system. Participants were
provided with data sets containing (simulated) SCADA observations, and challenged to design
an attack detection algorithm. The effectiveness of all submitted algorithms was evaluated in
terms of time-to-detection and classification accuracy. Seven teams participated in the battle
and proposed a variety of successful approaches leveraging data analysis, model-based detection
mechanisms, and rule checking. Results were presented at the Water Distribution Systems Analysis
Symposium (World Environmental and Water Resources Congress) in Sacramento, California on
May 21-25, 2017.
The `paper <https://doi.org/10.1061/(ASCE)WR.1943-5452.0000969>`_ summarizes the BATADAL
problem, proposed algorithms, results, and future research directions.

See https://www.batadal.net/ for details.

This module provides a function for loading the original BATADAL data set
(:func:`~epyt_flow.data.benchmarks.batadal.load_data`), as well as functions for loading the
scenario (:func:`~epyt_flow.data.benchmarks.batadal.load_scenario`) and the pre-generated
SCADA data (:func:`~epyt_flow.data.benchmarks.batadal.load_scada_data`).
 26"""
 27import os
 28from typing import Any
 29from datetime import datetime
 30import pandas as pd
 31import numpy as np
 32
 33from .batadal_data import TRAINING_DATA_2_ATTACKS_TIME, TRAINING_DATA_2_START_TIME, \
 34    TEST_DATA_ATTACKS_TIME, TEST_DATA_START_TIME
 35from ...utils import get_temp_folder, unpack_zip_archive, to_seconds, download_if_necessary
 36from ...simulation import ScenarioConfig
 37
 38
 39def __parse_attacks_time(start_time: str, attacks_time):
 40    events = []
 41    for event in attacks_time.splitlines():
 42        # Parse entry
 43        items = [i.strip() for i in event.split(",")]
 44
 45        event_start_time = int((datetime.strptime(items[0], "%d/%m/%Y %H:%M") - start_time)
 46                               .total_seconds())
 47        event_end_time = int((datetime.strptime(items[1], "%d/%m/%Y %H:%M") - start_time)
 48                             .total_seconds())
 49
 50        events.append((event_start_time, event_end_time))
 51
 52    return events
 53
 54

[docs]
 55def load_data(download_dir: str = None, return_X_y: bool = False,
 56              return_ground_truth: bool = False, return_features_desc: bool = False,
 57              verbose: bool = True) -> dict:
 58    """
 59    Loads the original BATADAL competition data.
 60
 61    Parameters
 62    ----------
 63    download_dir : `str`, optional
 64        Path to the data files -- if None, the temp folder will be used.
 65        If the path does not exist, the data files will be downloaded to the given path.
 66
 67        The default is None.
 68    return_X_y : `bool`, optional
 69        If True, the data together with the labels is returned as pairs of Numpy arrays.
 70        Otherwise, the data is returned as Pandas data frames.
 71
 72        The default is False.
 73    return_ground_truth : `bool`
 74        If True and if `return_X_y` is True, the ground truth labels are included in the
 75        returned dictionary -- note that the labels provided in the benchmark constitute
 76        a partial labeling only.
 77
 78        The default is False.
 79    return_features_desc : `bool`
 80        If True and if `return_X_y` is True, feature names (i.e. descriptions) are included
 81        in the returned dictionary.
 82
 83        The default is False.
 84    verbose : `bool`, optional
 85        If True, a progress bar is shown while downloading files.
 86
 87        The default is True.
 88
 89    Returns
 90    -------
 91    `dict`
 92        Dictionary of the loaded benchmark data. The dictionary contains the two training
 93        data sets ("train_1" and "train_2"), as well as the test data set ("test").
 94        If `return_X_y` is False, each dictionary entry is a `Pandas dataframe <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html>`_.
 95        Otherwise, it is a tuple of sensor readings and labels (except for the test set) --
 96        if `return_ground_truth` is True or `return_features_desc` is True, the corresponding
 97        data is appended to the tuple.
 98    """
 99    download_dir = download_dir if download_dir is not None else get_temp_folder()
100
101    # Download data
102    training_data_1_url = "https://www.batadal.net/data/BATADAL_dataset03.csv"
103    training_data_2_url = "https://www.batadal.net/data/BATADAL_dataset04.csv"
104    test_data_url = "https://www.batadal.net/data/BATADAL_test_dataset.zip"
105
106    training_data_1_path = os.path.join(download_dir, "BATADAL_dataset03.csv")
107    training_data_2_path = os.path.join(download_dir, "BATADAL_dataset04.csv")
108
109    download_if_necessary(training_data_1_path, training_data_1_url, verbose)
110    download_if_necessary(training_data_2_path, training_data_2_url, verbose)
111
112    download_if_necessary(os.path.join(download_dir, "BATADAL_test_dataset.zip"),
113                          test_data_url, verbose)
114    unpack_zip_archive(os.path.join(download_dir, "BATADAL_test_dataset.zip"), download_dir)
115
116    # Load and return data
117    df_train_1 = pd.read_csv(training_data_1_path)
118    df_train_2 = pd.read_csv(training_data_2_path)
119    df_test = pd.read_csv(os.path.join(download_dir, "BATADAL_test_dataset.csv"))
120
121    if return_X_y is True:
122        # Convert data to numpy
123        y_train_1 = df_train_1["ATT_FLAG"].to_numpy().astype(np.int8)
124        del df_train_1["ATT_FLAG"]
125        del df_train_1["DATETIME"]
126        X_train_1 = df_train_1.to_numpy()
127
128        y_train_2 = df_train_2[" ATT_FLAG"].to_numpy(copy=True)
129        idx = np.argwhere(y_train_2 == -999)
130        y_train_2[idx] = 0
131        y_train_2 = y_train_2.astype(np.int8)
132        del df_train_2[" ATT_FLAG"]
133        del df_train_2["DATETIME"]
134        X_train_2 = df_train_2.to_numpy()
135
136        del df_test["DATETIME"]
137        X_test = df_test.to_numpy()
138
139        # Create ground truth labels
140        hydraulic_time_step = to_seconds(minutes=15)
141        training_data_2_events_time = __parse_attacks_time(TRAINING_DATA_2_START_TIME,
142                                                           TRAINING_DATA_2_ATTACKS_TIME)
143        test_data_events_time = __parse_attacks_time(TEST_DATA_START_TIME, TEST_DATA_ATTACKS_TIME)
144
145        y_train_2_truth = np.zeros(X_train_2.shape[0])
146        for event_start, event_end in training_data_2_events_time:
147            t0 = int(event_start / hydraulic_time_step)
148            t1 = int(event_end / hydraulic_time_step)
149            y_train_2_truth[t0:t1] = 1
150
151        y_test_truth = np.zeros(X_test.shape[0])
152        for event_start, event_end in test_data_events_time:
153            t0 = int(event_start / hydraulic_time_step)
154            t1 = int(event_end / hydraulic_time_step)
155            y_test_truth[t0:t1] = 1
156
157        # Create features' descriptions
158        features_desc = list(df_train_1.columns)
159        desc_mapping = {"PU": "Pump", "V": "Valve", "T": "Tank", "L": "Level", "S": "State",
160                        "P": "Pressure", "F": "Flow"}
161        for i, f_desc in enumerate(features_desc):
162            pump = False
163            for k, value in desc_mapping.items():
164                if k in f_desc:
165                    if k == "P" and pump is True:
166                        continue
167                    f_desc = f_desc.replace(k, value)
168                    if k == "PU":
169                        pump = True
170            features_desc[i] = f_desc
171
172        # Create final results
173        r = {"train_1": (X_train_1, y_train_1), "train_2": (X_train_2, y_train_2),
174             "test": X_test}
175
176        if return_ground_truth is True:
177            r["train_1"] = (r["train_1"][0], r["train_1"][1], y_train_1)
178            r["train_2"] = (r["train_2"][0], r["train_2"][1], y_train_2_truth)
179            r["test"] = (r["test"][0], y_test_truth)
180
181        if return_features_desc is True:
182            r["features_desc"] = features_desc
183
184        return r
185    else:
186        return {"train_1": df_train_1, "train_2": df_train_2, "test": df_test}

187
188

[docs]
def load_scada_data(download_dir: str = None, return_X_y: bool = False,
                    return_ground_truth: bool = False, return_features_desc: bool = False,
                    verbose: bool = True) -> Any:
    """
    Loads the SCADA data of the simulated BATADAL benchmark scenario -- note that due to
    randomness and undocumented aspects of the original BATADAL data set, these differ from
    the original data set which can be loaded by calling
    :func:`~epyt_flow.data.benchmarks.batadal.load_data`.

    .. warning::

        This function is a placeholder: it currently ignores all arguments and always
        raises a `NotImplementedError`.

    Parameters
    ----------
    download_dir : `str`, optional
        Path to the data files -- if None, the temp folder will be used.
        If the path does not exist, the data files will be downloaded to the given path.

        The default is None.
    return_X_y : `bool`, optional
        If True, the data together with the labels is returned as pairs of Numpy arrays.
        Otherwise, the data is returned as Pandas data frames.

        The default is False.
    return_ground_truth : `bool`
        If True and if `return_X_y` is True, the ground truth labels are included in the
        returned dictionary -- note that the labels provided in the benchmark constitute
        a partial labeling only.

        The default is False.
    return_features_desc : `bool`
        If True and if `return_X_y` is True, feature names (i.e. descriptions) are included
        in the returned dictionary.

        The default is False.
    verbose : `bool`, optional
        If True, a progress bar is shown while downloading files.

        The default is True.

    Raises
    ------
    NotImplementedError
        Always, because loading the pre-generated SCADA data is not implemented.
    """
    raise NotImplementedError("Loading the pre-generated BATADAL SCADA data "
                              "is not implemented yet")

227
228

[docs]
def load_scenario(download_dir: str = None, verbose: bool = True) -> ScenarioConfig:
    """
    Creates and returns the BATADAL scenario -- it can be either modified or directly passed
    to the simulator :class:`~epyt_flow.simulation.scenario_simulator.ScenarioSimulator`.

    .. warning::

        This function is a placeholder: it currently ignores all arguments and always
        raises a `NotImplementedError`.

    .. note::

        Note that due to randomness and undocumented aspects of the original BATADAL benchmark,
        the scenario simulation results differ from the original data set which can be loaded by
        calling :func:`~epyt_flow.data.benchmarks.batadal.load_data`.

    Parameters
    ----------
    download_dir : `str`, optional
        Path to the data files -- if None, the temp folder will be used.
        If the path does not exist, the data files will be downloaded to the given path.

        The default is None.
    verbose : `bool`, optional
        If True, a progress bar is shown while downloading files.

        The default is True.

    Returns
    -------
    :class:`~epyt_flow.simulation.scenario_config.ScenarioConfig`
        The BATADAL scenario.

    Raises
    ------
    NotImplementedError
        Always, because creating the BATADAL scenario is not implemented.
    """
    raise NotImplementedError("Creating the BATADAL scenario is not implemented yet")