Source code for epyt_flow.data.benchmarks.water_usage

  1"""
  2Module provides functions for loading the water usage data set by P. Pavlou et al. and for evaluating predictions on it.
  3"""
  4import os
  5import numpy as np
  6import pandas as pd
  7from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, f1_score
  8
  9from ...utils import get_temp_folder, download_if_necessary
 10
 11

[docs]
 12def compute_evaluation_score(y_pred: np.ndarray, y: np.ndarray) -> dict:
 13    """
 14    Evaluates the performance of a detection method.
 15
 16    Note that instead of a single metric, the following set of metrics is used:
 17        - Accuracy
 18        - Precision
 19        - F1-score (using "micro" averaging)
 20        - Cohen's kappa
 21        - ROC AUC
 22
 23    Parameters
 24    ----------
 25    y_pred : `numpy.ndarray <https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html>`_
 26        Event indication prediction over time
 27    y : `numpy.ndarray <https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html>`_
 28        Ground truth event indication over time.
 29
 30    Returns
 31    -------
 32    `dict`
 33        All evaluation scores.
 34    """
 35    return {"accuracy": accuracy_score(y, y_pred),
 36            "precision": precision_score(y, y_pred, average="weighted"),
 37            "f1-micro": f1_score(y, y_pred, average="micro"), "roc-auc": roc_auc_score(y, y_pred)}

 38
 39

[docs]
 40def load_water_usage(download_dir: str = None, return_X_y: bool = True, verbose: bool = True) -> dict:
 41    """
 42    "Monitoring domestic water consumption: A comparative study of model-based and data-driven
 43    end-use disaggregation methods" by P. Pavlou, S. Filippou, S. Solonos, S. G. Vrachimis,
 44    K. Malialis, D. G. Eliades, T. Theocharides, M. M. Polycarpou is a benchmark concerning the
 45    monitoring of water usage of different household appliances. Informing consumers about it has
 46    been shown to have an impact on their behavior toward drinking water conservation. The data
 47    were created using the STochastic Residential water End-use Model (STREaM)
 48    (Cominola et al., 2018), a modelling software developed that generates synthetic time series
 49    data of a household.
 50
 51    This benchmark data set is for identifying active appliances from the aggregated water
 52    consumption -- i.e. a multi-class classification probelm. The data set considers the use
 53    of standard toilet, standard shower, standard faucet, high efficiency clothes washer,
 54    and standard dishwasher in a 2-person household for a period of 180 days (6 months) and
 55    it has a resolution of 10s.
 56    The data set is already split into 3 sub-sets for training (90 days), validation (45 days),
 57    and testing (45 days).
 58
 59    For more information see https://github.com/KIOS-Research/Water-Usage-Dataset/
 60
 61    .. note::
 62
 63        Note that although this data set is synthetic, only the final data set is provided.
 64
 65    Parameters
 66    ----------
 67    download_dir : `str`, optional
 68        Path to the data files -- if None, the temp folder will be used.
 69        If the path does not exist, the data files will be downloaded to the given path.
 70
 71        The default is None.
 72    return_X_y : `bool`, optional
 73        If True, the data is returned together with the multi-class labels as two Numpy arrays,
 74        otherwise, the data is returned as Pandas data frame.
 75
 76        The default is True.
 77    verbose : `bool`, optional
 78        If True, a progress bar is shown while downloading files.
 79
 80        The default is True.
 81
 82    Returns
 83    -------
 84    `dict`
 85        The data set as a dictionary with entries "train", "validation", and "test" containing
 86        the respective data.
 87    """
 88    # Download data if necessary
 89    download_dir = download_dir if download_dir is not None else get_temp_folder()
 90
 91    base_url = "https://github.com/KIOS-Research/Water-Usage-Dataset/raw/main/Dataset/"
 92    url_train_data = base_url + "Trainset.csv"
 93    url_valid_data = base_url + "Validationset.csv"
 94    url_test_data = base_url + "Testset.csv"
 95
 96    f_train_in = os.path.join(download_dir, "train_water_usage.csv")
 97    f_valid_in = os.path.join(download_dir, "valid_water_usage.csv")
 98    f_test_in = os.path.join(download_dir, "test_water_usage.csv")
 99
100    download_if_necessary(f_train_in, url_train_data, verbose)
101    download_if_necessary(f_valid_in, url_valid_data, verbose)
102    download_if_necessary(f_test_in, url_test_data, verbose)
103
104    # Load and return data
105    df_data_train = pd.read_csv(f_train_in)
106    df_data_valid = pd.read_csv(f_valid_in)
107    df_data_test = pd.read_csv(f_test_in)
108
109    if return_X_y is False:
110        return {"train": df_data_train, "validation": df_data_valid, "test": df_data_test}
111    else:
112        r = {"train": None, "validation": None, "test": None}
113
114        for k, df_data in zip(["train", "validation", "test"],
115                              [df_data_train, df_data_valid, df_data_test]):
116            X = df_data["TOTAL"].to_numpy()
117            del df_data["TOTAL"]
118
119            y = df_data.to_numpy()
120            y = (y != 0).astype(np.int8)
121
122            r[k] = (X, y)
123
124        return r