1"""
2Module provides a function for loading the water usage data set by P. Pavlou et al.
3"""
4import os
5import numpy as np
6import pandas as pd
7from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, f1_score
8
9from ...utils import get_temp_folder, download_if_necessary
10
11
[docs]
def compute_evaluation_score(y_pred: np.ndarray, y: np.ndarray) -> dict:
    """
    Scores event indication predictions against the ground truth.

    Several metrics are reported together instead of a single number:

        - Accuracy
        - Precision (using "weighted" averaging)
        - F1-score (using "micro" averaging)
        - ROC AUC

    Parameters
    ----------
    y_pred : `numpy.ndarray <https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html>`_
        Event indication prediction over time.
    y : `numpy.ndarray <https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html>`_
        Ground truth event indication over time.

    Returns
    -------
    `dict`
        All evaluation scores, stored under the keys "accuracy", "precision",
        "f1-micro", and "roc-auc".
    """
    # Every scikit-learn metric expects the ground truth first and the prediction second.
    scores = {}
    scores["accuracy"] = accuracy_score(y, y_pred)
    scores["precision"] = precision_score(y, y_pred, average="weighted")
    scores["f1-micro"] = f1_score(y, y_pred, average="micro")
    scores["roc-auc"] = roc_auc_score(y, y_pred)

    return scores
38
39
[docs]
def load_water_usage(download_dir: str = None, return_X_y: bool = True,
                     verbose: bool = True) -> dict:
    """
    "Monitoring domestic water consumption: A comparative study of model-based and data-driven
    end-use disaggregation methods" by P. Pavlou, S. Filippou, S. Solonos, S. G. Vrachimis,
    K. Malialis, D. G. Eliades, T. Theocharides, M. M. Polycarpou is a benchmark concerning the
    monitoring of water usage of different household appliances. Informing consumers about it has
    been shown to have an impact on their behavior toward drinking water conservation. The data
    were created using the STochastic Residential water End-use Model (STREaM)
    (Cominola et al., 2018), a modelling software developed that generates synthetic time series
    data of a household.

    This benchmark data set is for identifying active appliances from the aggregated water
    consumption -- i.e. a multi-class classification problem. The data set considers the use
    of standard toilet, standard shower, standard faucet, high efficiency clothes washer,
    and standard dishwasher in a 2-person household for a period of 180 days (6 months) and
    it has a resolution of 10s.
    The data set is already split into 3 sub-sets for training (90 days), validation (45 days),
    and testing (45 days).

    For more information see https://github.com/KIOS-Research/Water-Usage-Dataset/

    .. note::

        Note that although this data set is synthetic, only the final data set is provided.

    Parameters
    ----------
    download_dir : `str`, optional
        Path to the data files -- if None, the temp folder will be used.
        If the path does not exist, the data files will be downloaded to the given path.

        The default is None.
    return_X_y : `bool`, optional
        If True, the data is returned together with the multi-class labels as two Numpy arrays,
        otherwise, the data is returned as Pandas data frame.

        The default is True.
    verbose : `bool`, optional
        If True, a progress bar is shown while downloading files.

        The default is True.

    Returns
    -------
    `dict`
        The data set as a dictionary with entries "train", "validation", and "test" containing
        the respective data.

        If `return_X_y` is True, each entry is a tuple `(X, y)`: `X` holds the values of the
        "TOTAL" column and `y` is an `int8` array that is 1 wherever the corresponding entry
        of the remaining columns is non-zero and 0 otherwise.
        Otherwise, each entry is the Pandas data frame as loaded from the respective CSV file.
    """
    # Download data if necessary
    if download_dir is None:
        download_dir = get_temp_folder()

    base_url = "https://github.com/KIOS-Research/Water-Usage-Dataset/raw/main/Dataset/"

    # Sub-set name -> (file name in the remote repository, local file name)
    file_names = {"train": ("Trainset.csv", "train_water_usage.csv"),
                  "validation": ("Validationset.csv", "valid_water_usage.csv"),
                  "test": ("Testset.csv", "test_water_usage.csv")}

    local_files = {}
    for subset, (remote_name, local_name) in file_names.items():
        local_files[subset] = os.path.join(download_dir, local_name)
        download_if_necessary(local_files[subset], base_url + remote_name, verbose)

    # Load data -- only after all downloads have finished
    data_frames = {subset: pd.read_csv(f_in) for subset, f_in in local_files.items()}

    # Truthiness check instead of `is False`, so that falsy non-singleton values
    # (e.g. 0 or numpy.False_) also select the data frame output
    if not return_X_y:
        return data_frames

    r = {}
    for subset, df_data in data_frames.items():
        # pop() removes the aggregated consumption from the frame,
        # so that only the label columns remain
        X = df_data.pop("TOTAL").to_numpy()

        # Binarize the remaining columns: any non-zero entry becomes 1
        y = (df_data.to_numpy() != 0).astype(np.int8)

        r[subset] = (X, y)

    return r