Source code for epyt_flow.data.benchmarks.water_usage

  1"""
  2Module provides a function for loading the water usage data set by P. Pavlou et al.
  3"""
  4import os
  5import numpy as np
  6import pandas as pd
  7from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, f1_score
  8
  9from ...utils import get_temp_folder, download_if_necessary
 10
 11
def compute_evaluation_score(y_pred: np.ndarray, y: np.ndarray) -> dict:
    """
    Evaluates the performance of a detection method.

    Note that instead of a single metric, the following set of metrics is used:
        - Accuracy
        - Precision (using "weighted" averaging)
        - F1-score (using "micro" averaging)
        - ROC AUC

    Parameters
    ----------
    y_pred : `numpy.ndarray <https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html>`_
        Event indication prediction over time
    y : `numpy.ndarray <https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html>`_
        Ground truth event indication over time.

    Returns
    -------
    `dict`
        All evaluation scores, stored under the keys "accuracy", "precision",
        "f1-micro", and "roc-auc".
    """
    # The scikit-learn metrics expect the ground truth first and the prediction second,
    # i.e. the reverse of this function's own argument order.
    return {"accuracy": accuracy_score(y, y_pred),
            "precision": precision_score(y, y_pred, average="weighted"),
            "f1-micro": f1_score(y, y_pred, average="micro"),
            "roc-auc": roc_auc_score(y, y_pred)}
38 39
def load_water_usage(download_dir: str = None, return_X_y: bool = True,
                     verbose: bool = True) -> dict:
    """
    Loads the water usage benchmark data set by P. Pavlou et al.

    "Monitoring domestic water consumption: A comparative study of model-based and data-driven
    end-use disaggregation methods" by P. Pavlou, S. Filippou, S. Solonos, S. G. Vrachimis,
    K. Malialis, D. G. Eliades, T. Theocharides, M. M. Polycarpou is a benchmark concerning the
    monitoring of water usage of different household appliances. Informing consumers about it has
    been shown to have an impact on their behavior toward drinking water conservation. The data
    were created using the STochastic Residential water End-use Model (STREaM)
    (Cominola et al., 2018), a modelling software that generates synthetic time series
    data of a household.

    This benchmark data set is for identifying active appliances from the aggregated water
    consumption -- i.e. a multi-class classification problem. The data set considers the use
    of standard toilet, standard shower, standard faucet, high efficiency clothes washer,
    and standard dishwasher in a 2-person household for a period of 180 days (6 months) and
    it has a resolution of 10s.
    The data set is already split into 3 sub-sets for training (90 days), validation (45 days),
    and testing (45 days).

    For more information see https://github.com/KIOS-Research/Water-Usage-Dataset/

    .. note::

        Note that although this data set is synthetic, only the final data set is provided.

    Parameters
    ----------
    download_dir : `str`, optional
        Path to the data files -- if None, the temp folder will be used.
        If the path does not exist, the data files will be downloaded to the given path.

        The default is None.
    return_X_y : `bool`, optional
        If True, the data is returned together with the multi-class labels as two Numpy arrays,
        otherwise, the data is returned as Pandas data frame.

        The default is True.
    verbose : `bool`, optional
        If True, a progress bar is shown while downloading files.

        The default is True.

    Returns
    -------
    `dict`
        The data set as a dictionary with entries "train", "validation", and "test" containing
        the respective data.
        If `return_X_y` is True, each entry is a tuple `(X, y)` where `X` holds the values of
        the "TOTAL" column and `y` is an `int8` array that is 1 wherever one of the remaining
        columns is non-zero and 0 otherwise. Otherwise, each entry is the unmodified
        Pandas data frame.
    """
    # Download data if necessary
    if download_dir is None:
        download_dir = get_temp_folder()

    base_url = "https://github.com/KIOS-Research/Water-Usage-Dataset/raw/main/Dataset/"

    # Sub-set name -> (file name in the remote repository, local file name)
    files = {"train": ("Trainset.csv", "train_water_usage.csv"),
             "validation": ("Validationset.csv", "valid_water_usage.csv"),
             "test": ("Testset.csv", "test_water_usage.csv")}

    # Fetch all files first so that nothing is parsed if any download fails
    local_paths = {}
    for subset, (remote_name, local_name) in files.items():
        f_in = os.path.join(download_dir, local_name)
        download_if_necessary(f_in, base_url + remote_name, verbose)
        local_paths[subset] = f_in

    # Load and return data
    data_frames = {subset: pd.read_csv(f_in) for subset, f_in in local_paths.items()}

    # Truthiness test instead of `is False`, so that falsy flags such as `numpy.False_`
    # or `0` are honored as well
    if not return_X_y:
        return data_frames

    r = {}
    for subset, df_data in data_frames.items():
        X = df_data["TOTAL"].to_numpy()

        # Every column except "TOTAL" is treated as a label; binarize to "non-zero or not".
        # `drop` returns a new frame, so the loaded data frame is not mutated.
        y = df_data.drop(columns="TOTAL").to_numpy()
        y = (y != 0).astype(np.int8)

        r[subset] = (X, y)

    return r