Coverage for biobb_ml/resampling/undersampling.py: 78%

165 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-05-07 09:39 +0000

1#!/usr/bin/env python3 

2 

3"""Module containing the Undersampling class and the command line interface.""" 

4import argparse 

5import pandas as pd 

6import numpy as np 

7from collections import Counter 

8from biobb_common.generic.biobb_object import BiobbObject 

9from sklearn import preprocessing 

10from sklearn.model_selection import cross_val_score 

11from sklearn.model_selection import RepeatedStratifiedKFold 

12from sklearn.ensemble import RandomForestClassifier 

13from biobb_ml.resampling.reg_resampler import resampler 

14from biobb_common.configuration import settings 

15from biobb_common.tools import file_utils as fu 

16from biobb_common.tools.file_utils import launchlogger 

17from biobb_ml.resampling.common import check_input_path, check_output_path, checkResamplingType, getSamplingStrategy, getHeader, getTargetValue, getTarget, getResamplingMethod, undersampling_methods 

18 

19 

class Undersampling(BiobbObject):
    """
    | biobb_ml Undersampling
    | Wrapper of most of the imblearn.under_sampling methods.
    | Remove samples from the majority class of a given dataset, with or without replacement. If regression is specified as type, the data will be resampled to classes in order to apply the undersampling model. Visit the imbalanced-learn official website for the different methods accepted in this wrapper: `RandomUnderSampler <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.RandomUnderSampler.html>`_, `NearMiss <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.NearMiss.html>`_, `CondensedNearestNeighbour <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.CondensedNearestNeighbour.html>`_, `TomekLinks <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.TomekLinks.html>`_, `EditedNearestNeighbours <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.EditedNearestNeighbours.html>`_, `NeighbourhoodCleaningRule <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.NeighbourhoodCleaningRule.html>`_, `ClusterCentroids <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.ClusterCentroids.html>`_.

    Args:
        input_dataset_path (str): Path to the input dataset. File type: input. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/data/resampling/dataset_resampling.csv>`_. Accepted formats: csv (edam:format_3752).
        output_dataset_path (str): Path to the output dataset. File type: output. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/resampling/ref_output_undersampling.csv>`_. Accepted formats: csv (edam:format_3752).
        properties (dic - Python dictionary object containing the tool parameters, not input/output files):
            * **method** (*str*) - (None) Undersampling method. It's a mandatory property. Values: random (`RandomUnderSampler <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.RandomUnderSampler.html>`_: Under-sample the majority classes by randomly picking samples with or without replacement), nearmiss (`NearMiss <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.NearMiss.html>`_: Class to perform under-sampling based on NearMiss methods), cnn (`CondensedNearestNeighbour <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.CondensedNearestNeighbour.html>`_: Class to perform under-sampling based on the condensed nearest neighbour method), tomeklinks (`TomekLinks <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.TomekLinks.html>`_: Class to perform under-sampling by removing Tomek's links), enn (`EditedNearestNeighbours <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.EditedNearestNeighbours.html>`_: Class to perform under-sampling based on the edited nearest neighbour method), ncr (`NeighbourhoodCleaningRule <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.NeighbourhoodCleaningRule.html>`_: Class performing under-sampling based on the neighbourhood cleaning rule), cluster (`ClusterCentroids <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.ClusterCentroids.html>`_: Method that under samples the majority class by replacing a cluster of majority samples by the cluster centroid of a KMeans algorithm).
            * **type** (*str*) - (None) Type of undersampling. It's a mandatory property. Values: regression (the undersampling will be applied on a continuous dataset), classification (the undersampling will be applied on a classified dataset).
            * **target** (*dict*) - ({}) Dependent variable you want to predict from your dataset. You can specify either a column name or a column index. Formats: { "column": "column3" } or { "index": 21 }. In case of mulitple formats, the first one will be picked.
            * **evaluate** (*bool*) - (False) Whether or not to evaluate the dataset before and after applying the resampling.
            * **evaluate_splits** (*int*) - (3) [2~100|1] Number of folds to be applied by the Repeated Stratified K-Fold evaluation method. Must be at least 2.
            * **evaluate_repeats** (*int*) - (3) [2~100|1] Number of times Repeated Stratified K-Fold cross validator needs to be repeated.
            * **n_bins** (*int*) - (5) [1~100|1] Only for regression undersampling. The number of classes that the user wants to generate with the target data.
            * **balanced_binning** (*bool*) - (False) Only for regression undersampling. Decides whether samples are to be distributed roughly equally across all classes.
            * **sampling_strategy** (*dict*) - ({ "target": "auto" }) Sampling information to sample the data set. Formats: { "target": "auto" }, { "ratio": 0.3 }, { "dict": { 0: 300, 1: 200, 2: 100 } } or { "list": [0, 2, 3] }. When "target", specify the class targeted by the resampling; the number of samples in the different classes will be equalized; possible choices are: majority (resample only the majority class), not minority (resample all classes but the minority class), not majority (resample all classes but the majority class), all (resample all classes), auto (equivalent to 'not minority'). When "ratio", it corresponds to the desired ratio of the number of samples in the minority class over the number of samples in the majority class after resampling (ONLY IN CASE OF BINARY CLASSIFICATION). When "dict", the keys correspond to the targeted classes, the values correspond to the desired number of samples for each targeted class. When "list", the list contains the classes targeted by the resampling.
            * **version** (*int*) - (1) Only for NearMiss method. Version of the NearMiss to use. Values: 1 (selects samples of the majority class that their average distances to three closest instances of the minority class are the smallest), 2 (uses three farthest samples of the minority class), 3 (selects a given number of the closest samples of the majority class for each sample of the minority class).
            * **n_neighbors** (*int*) - (1) [1~100|1] Only for NearMiss, CondensedNearestNeighbour, EditedNearestNeighbours and NeighbourhoodCleaningRule methods. Size of the neighbourhood to consider to compute the average distance to the minority point samples.
            * **threshold_cleaning** (*float*) - (0.5) [0~1|0.1] Only for NeighbourhoodCleaningRule method. Threshold used to whether consider a class or not during the cleaning after applying ENN.
            * **random_state_method** (*int*) - (5) [1~1000|1] Only for RandomUnderSampler and ClusterCentroids methods. Controls the randomization of the algorithm.
            * **random_state_evaluate** (*int*) - (5) [1~1000|1] Controls the shuffling applied to the Repeated Stratified K-Fold evaluation method.
            * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
            * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.

    Examples:
        This is a use example of how to use the building block from Python::

            from biobb_ml.resampling.undersampling import undersampling
            prop = {
                'method': 'enn',
                'type': 'regression',
                'target': {
                    'column': 'target'
                },
                'evaluate': True,
                'n_bins': 10,
                'n_neighbors': 3,
                'sampling_strategy': {
                    'target': 'auto'
                }
            }
            undersampling(input_dataset_path='/path/to/myDataset.csv',
                        output_dataset_path='/path/to/newDataset.csv',
                        properties=prop)

    Info:
        * wrapped_software:
            * name: imbalanced-learn under_sampling
            * version: >0.7.0
            * license: MIT
        * ontology:
            * name: EDAM
            * schema: http://edamontology.org/EDAM.owl

    """

    def __init__(self, input_dataset_path, output_dataset_path,
                 properties=None, **kwargs) -> None:
        properties = properties or {}

        # Call parent class constructor
        super().__init__(properties)
        self.locals_var_dict = locals().copy()

        # Input/Output files
        self.io_dict = {
            "in": {"input_dataset_path": input_dataset_path},
            "out": {"output_dataset_path": output_dataset_path}
        }

        # Properties specific for BB
        self.method = properties.get('method', None)
        self.type = properties.get('type', None)
        self.target = properties.get('target', {})
        self.evaluate = properties.get('evaluate', False)
        self.evaluate_splits = properties.get('evaluate_splits', 3)
        self.evaluate_repeats = properties.get('evaluate_repeats', 3)
        self.n_bins = properties.get('n_bins', 5)
        self.balanced_binning = properties.get('balanced_binning', False)
        self.sampling_strategy = properties.get('sampling_strategy', {'target': 'auto'})
        self.version = properties.get('version', 1)
        self.n_neighbors = properties.get('n_neighbors', 1)
        # FIX: default was 1, but the documented default (and imblearn's own
        # NeighbourhoodCleaningRule default) is 0.5.
        self.threshold_cleaning = properties.get('threshold_cleaning', 0.5)
        self.random_state_method = properties.get('random_state_method', 5)
        self.random_state_evaluate = properties.get('random_state_evaluate', 5)
        self.properties = properties

        # Check the properties
        self.check_properties(properties)
        self.check_arguments()

    def check_data_params(self, out_log, err_log):
        """ Checks all the input/output paths and parameters """
        self.io_dict["in"]["input_dataset_path"] = check_input_path(self.io_dict["in"]["input_dataset_path"], "input_dataset_path", out_log, self.__class__.__name__)
        self.io_dict["out"]["output_dataset_path"] = check_output_path(self.io_dict["out"]["output_dataset_path"], "output_dataset_path", False, out_log, self.__class__.__name__)

    @launchlogger
    def launch(self) -> int:
        """Execute the :class:`Undersampling <resampling.undersampling.Undersampling>` resampling.undersampling.Undersampling object."""

        # check input/output paths and parameters
        self.check_data_params(self.out_log, self.err_log)

        # Setup Biobb
        if self.check_restart():
            return 0
        self.stage_files()

        # check mandatory properties
        method = getResamplingMethod(self.method, 'undersampling', self.out_log, self.__class__.__name__)
        checkResamplingType(self.type, self.out_log, self.__class__.__name__)
        sampling_strategy = getSamplingStrategy(self.sampling_strategy, self.out_log, self.__class__.__name__)

        # load dataset
        fu.log('Getting dataset from %s' % self.io_dict["in"]["input_dataset_path"], self.out_log, self.global_log)
        if 'column' in self.target:
            # target referenced by name -> the CSV has a header row
            labels = getHeader(self.io_dict["in"]["input_dataset_path"])
            skiprows = 1
            header = 0
        else:
            labels = None
            skiprows = None
            header = None
        data = pd.read_csv(self.io_dict["in"]["input_dataset_path"], header=None, sep="\\s+|;|:|,|\t", engine="python", skiprows=skiprows, names=labels)

        train_df = data
        ranges = None

        # FIX: keep one LabelEncoder per categorical column. The previous code
        # reused a single encoder for every object column, so the decoding step
        # below was only correct for the last column that was encoded.
        encoders = {}
        cols_encoded = []
        for column in train_df:
            # if type object, LabelEncoder.fit_transform
            if train_df[column].dtypes == 'object':
                cols_encoded.append(column)
                encoders[column] = preprocessing.LabelEncoder()
                train_df[column] = encoders[column].fit_transform(train_df[column])

        # defining X (all columns except the target)
        X = train_df.loc[:, train_df.columns != getTargetValue(self.target, self.out_log, self.__class__.__name__)]
        # instantiate the selected undersampling method with its specific arguments
        if self.method == 'random':
            method = method(sampling_strategy=sampling_strategy, random_state=self.random_state_method)
        elif self.method == 'nearmiss':
            if self.version == 3:
                # NearMiss version 3 takes the neighbours count through a
                # differently named keyword argument
                method = method(sampling_strategy=sampling_strategy, version=self.version, n_neighbors_ver3=self.n_neighbors)
            else:
                method = method(sampling_strategy=sampling_strategy, version=self.version, n_neighbors=self.n_neighbors)
        elif self.method == 'cnn':
            method = method(sampling_strategy=sampling_strategy, n_neighbors=self.n_neighbors)
        elif self.method == 'tomeklinks':
            method = method(sampling_strategy=sampling_strategy)
        elif self.method == 'enn':
            method = method(sampling_strategy=sampling_strategy, n_neighbors=self.n_neighbors)
        elif self.method == 'ncr':
            method = method(sampling_strategy=sampling_strategy, n_neighbors=self.n_neighbors, threshold_cleaning=self.threshold_cleaning)
        elif self.method == 'cluster':
            method = method(sampling_strategy=sampling_strategy, random_state=self.random_state_method)

        fu.log('Target: %s' % (getTargetValue(self.target, self.out_log, self.__class__.__name__)), self.out_log, self.global_log)

        # undersampling
        if self.type == 'regression':
            fu.log('Undersampling regression dataset, continuous data will be classified', self.out_log, self.global_log)
            # call resampler class for Regression ReSampling
            rs = resampler()
            # Create n_bins classes for the dataset
            ranges, y, target_pos = rs.fit(train_df, target=getTargetValue(self.target, self.out_log, self.__class__.__name__), bins=self.n_bins, balanced_binning=self.balanced_binning, verbose=0)
            # Get the under-sampled data
            final_X, final_y = rs.resample(method, train_df, y)
        elif self.type == 'classification':
            # get X and y
            y = getTarget(self.target, train_df, self.out_log, self.__class__.__name__)
            # fit and resample
            final_X, final_y = method.fit_resample(X, y)
            target_pos = None

        # evaluate dataset before undersampling
        if self.evaluate:
            fu.log('Evaluating data before undersampling with RandomForestClassifier', self.out_log, self.global_log)
            cv = RepeatedStratifiedKFold(n_splits=self.evaluate_splits, n_repeats=self.evaluate_repeats, random_state=self.random_state_evaluate)
            # evaluate model
            scores = cross_val_score(RandomForestClassifier(class_weight='balanced'), X, y, scoring='accuracy', cv=cv, n_jobs=-1)
            if not np.isnan(np.mean(scores)):
                fu.log('Mean Accuracy before undersampling: %.3f' % (np.mean(scores)), self.out_log, self.global_log)
            else:
                fu.log('Unable to calculate cross validation score, NaN was returned.', self.out_log, self.global_log)

        # log distribution before undersampling
        dist = ''
        for k, v in Counter(y).items():
            per = v / len(y) * 100
            rng = ''
            if ranges:
                rng = str(ranges[k])
            dist = dist + 'Class=%d, n=%d (%.3f%%) %s\n' % (k, v, per, rng)
        fu.log('Classes distribution before undersampling:\n\n%s' % dist, self.out_log, self.global_log)

        # join final_X and final_y in the output dataframe
        if header is None:
            # numpy
            out_df = np.column_stack((final_X, final_y))
        else:
            # pandas
            out_df = final_X.join(final_y)

        # if no header, convert np to pd
        if header is None:
            out_df = pd.DataFrame(data=out_df)

        # if cols encoded, decode them back to their original labels
        if cols_encoded:
            for column in cols_encoded:
                if header is None:
                    out_df = out_df.astype({column: int})
                out_df[column] = encoders[column].inverse_transform(out_df[column].values.ravel())

        # if no header, target is in a different column
        # NOTE(review): this truthiness check sends target_pos == 0 to the
        # fallback branch; kept as-is to preserve behaviour — confirm intent.
        if target_pos:
            t = target_pos
        else:
            t = getTargetValue(self.target, self.out_log, self.__class__.__name__)
        # log distribution after undersampling
        if self.type == 'regression':
            ranges, y_out, _ = rs.fit(out_df, target=t, bins=self.n_bins, balanced_binning=self.balanced_binning, verbose=0)
        elif self.type == 'classification':
            y_out = getTarget(self.target, out_df, self.out_log, self.__class__.__name__)

        dist = ''
        for k, v in Counter(y_out).items():
            per = v / len(y_out) * 100
            rng = ''
            if ranges:
                rng = str(ranges[k])
            dist = dist + 'Class=%d, n=%d (%.3f%%) %s\n' % (k, v, per, rng)
        fu.log('Classes distribution after undersampling:\n\n%s' % dist, self.out_log, self.global_log)

        # evaluate dataset after undersampling
        if self.evaluate:
            fu.log('Evaluating data after undersampling with RandomForestClassifier', self.out_log, self.global_log)
            # FIX: honour the configured evaluation properties. This block
            # previously hard-coded n_splits=3, n_repeats=3, random_state=42,
            # inconsistently with the pre-resampling evaluation above.
            cv = RepeatedStratifiedKFold(n_splits=self.evaluate_splits, n_repeats=self.evaluate_repeats, random_state=self.random_state_evaluate)
            # evaluate model
            scores = cross_val_score(RandomForestClassifier(class_weight='balanced'), final_X, y_out, scoring='accuracy', cv=cv, n_jobs=-1)
            if not np.isnan(np.mean(scores)):
                fu.log('Mean Accuracy after undersampling a %s dataset with %s method: %.3f' % (self.type, undersampling_methods[self.method]['method'], np.mean(scores)), self.out_log, self.global_log)
            else:
                fu.log('Unable to calculate cross validation score, NaN was returned.', self.out_log, self.global_log)

        # save output (write the header row only when the input had one)
        hdr = False
        if header == 0:
            hdr = True
        fu.log('Saving undersampled dataset to %s' % self.io_dict["out"]["output_dataset_path"], self.out_log, self.global_log)
        out_df.to_csv(self.io_dict["out"]["output_dataset_path"], index=False, header=hdr)

        # Copy files to host
        self.copy_to_host()

        self.tmp_files.extend([
            self.stage_io_dict.get("unique_dir")
        ])
        self.remove_tmp_files()

        self.check_arguments(output_files_created=True, raise_exception=False)

        return 0

288 

289 

def undersampling(input_dataset_path: str, output_dataset_path: str, properties: dict = None, **kwargs) -> int:
    """Create an :class:`Undersampling <resampling.undersampling.Undersampling>` instance and
    run its :meth:`launch() <resampling.undersampling.Undersampling.launch>` method."""

    # Build the building block and immediately launch it, returning its exit code.
    block = Undersampling(input_dataset_path=input_dataset_path,
                          output_dataset_path=output_dataset_path,
                          properties=properties, **kwargs)
    return block.launch()

297 

298 

def main():
    """Command line execution of this building block. Please check the command line documentation."""
    # The custom formatter keeps long help strings on a single line.
    cli = argparse.ArgumentParser(description="Wrapper of most of the imblearn.under_sampling methods.", formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999))
    cli.add_argument('--config', required=False, help='Configuration file')

    # Mandatory input/output file arguments for this building block.
    mandatory = cli.add_argument_group('required arguments')
    mandatory.add_argument('--input_dataset_path', required=True, help='Path to the input dataset. Accepted formats: csv.')
    mandatory.add_argument('--output_dataset_path', required=True, help='Path to the output dataset. Accepted formats: csv.')

    parsed = cli.parse_args()
    # Fall back to an empty configuration when no file is given.
    parsed.config = parsed.config or "{}"
    prop_dic = settings.ConfReader(config=parsed.config).get_prop_dic()

    # Delegate the actual work to the module-level wrapper.
    undersampling(input_dataset_path=parsed.input_dataset_path,
                  output_dataset_path=parsed.output_dataset_path,
                  properties=prop_dic)

317 

318 

# Entry point when the module is executed directly from the command line.
if __name__ == '__main__':
    main()