Coverage for biobb_ml/resampling/undersampling.py: 78%

165 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-05-07 09:39 +0000

1#!/usr/bin/env python3 

2 

3"""Module containing the Undersampling class and the command line interface.""" 

4import argparse 

5import pandas as pd 

6import numpy as np 

7from collections import Counter 

8from biobb_common.generic.biobb_object import BiobbObject 

9from sklearn import preprocessing 

10from sklearn.model_selection import cross_val_score 

11from sklearn.model_selection import RepeatedStratifiedKFold 

12from sklearn.ensemble import RandomForestClassifier 

13from biobb_ml.resampling.reg_resampler import resampler 

14from biobb_common.configuration import settings 

15from biobb_common.tools import file_utils as fu 

16from biobb_common.tools.file_utils import launchlogger 

17from biobb_ml.resampling.common import check_input_path, check_output_path, checkResamplingType, getSamplingStrategy, getHeader, getTargetValue, getTarget, getResamplingMethod, undersampling_methods 

18 

19 

class Undersampling(BiobbObject):
    """
    | biobb_ml Undersampling
    | Wrapper of most of the imblearn.under_sampling methods.
    | Remove samples from the majority class of a given dataset, with or without replacement. If regression is specified as type, the data will be resampled to classes in order to apply the undersampling model. Visit the imbalanced-learn official website for the different methods accepted in this wrapper: `RandomUnderSampler <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.RandomUnderSampler.html>`_, `NearMiss <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.NearMiss.html>`_, `CondensedNearestNeighbour <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.CondensedNearestNeighbour.html>`_, `TomekLinks <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.TomekLinks.html>`_, `EditedNearestNeighbours <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.EditedNearestNeighbours.html>`_, `NeighbourhoodCleaningRule <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.NeighbourhoodCleaningRule.html>`_, `ClusterCentroids <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.ClusterCentroids.html>`_.

    Args:
        input_dataset_path (str): Path to the input dataset. File type: input. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/data/resampling/dataset_resampling.csv>`_. Accepted formats: csv (edam:format_3752).
        output_dataset_path (str): Path to the output dataset. File type: output. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/resampling/ref_output_undersampling.csv>`_. Accepted formats: csv (edam:format_3752).
        properties (dic - Python dictionary object containing the tool parameters, not input/output files):
            * **method** (*str*) - (None) Undersampling method. It's a mandatory property. Values: random (`RandomUnderSampler <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.RandomUnderSampler.html>`_: Under-sample the majority classes by randomly picking samples with or without replacement), nearmiss (`NearMiss <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.NearMiss.html>`_: Class to perform under-sampling based on NearMiss methods), cnn (`CondensedNearestNeighbour <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.CondensedNearestNeighbour.html>`_: Class to perform under-sampling based on the condensed nearest neighbour method), tomeklinks (`TomekLinks <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.TomekLinks.html>`_: Class to perform under-sampling by removing Tomek's links), enn (`EditedNearestNeighbours <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.EditedNearestNeighbours.html>`_: Class to perform under-sampling based on the edited nearest neighbour method), ncr (`NeighbourhoodCleaningRule <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.NeighbourhoodCleaningRule.html>`_: Class performing under-sampling based on the neighbourhood cleaning rule), cluster (`ClusterCentroids <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.ClusterCentroids.html>`_: Method that under samples the majority class by replacing a cluster of majority samples by the cluster centroid of a KMeans algorithm).
            * **type** (*str*) - (None) Type of undersampling. It's a mandatory property. Values: regression (the undersampling will be applied on a continuous dataset), classification (the undersampling will be applied on a classified dataset).
            * **target** (*dict*) - ({}) Dependent variable you want to predict from your dataset. You can specify either a column name or a column index. Formats: { "column": "column3" } or { "index": 21 }. In case of mulitple formats, the first one will be picked.
            * **evaluate** (*bool*) - (False) Whether or not to evaluate the dataset before and after applying the resampling.
            * **evaluate_splits** (*int*) - (3) [2~100|1] Number of folds to be applied by the Repeated Stratified K-Fold evaluation method. Must be at least 2.
            * **evaluate_repeats** (*int*) - (3) [2~100|1] Number of times Repeated Stratified K-Fold cross validator needs to be repeated.
            * **n_bins** (*int*) - (5) [1~100|1] Only for regression undersampling. The number of classes that the user wants to generate with the target data.
            * **balanced_binning** (*bool*) - (False) Only for regression undersampling. Decides whether samples are to be distributed roughly equally across all classes.
            * **sampling_strategy** (*dict*) - ({ "target": "auto" }) Sampling information to sample the data set. Formats: { "target": "auto" }, { "ratio": 0.3 }, { "dict": { 0: 300, 1: 200, 2: 100 } } or { "list": [0, 2, 3] }. When "target", specify the class targeted by the resampling; the number of samples in the different classes will be equalized; possible choices are: majority (resample only the majority class), not minority (resample all classes but the minority class), not majority (resample all classes but the majority class), all (resample all classes), auto (equivalent to 'not minority'). When "ratio", it corresponds to the desired ratio of the number of samples in the minority class over the number of samples in the majority class after resampling (ONLY IN CASE OF BINARY CLASSIFICATION). When "dict", the keys correspond to the targeted classes, the values correspond to the desired number of samples for each targeted class. When "list", the list contains the classes targeted by the resampling.
            * **version** (*int*) - (1) Only for NearMiss method. Version of the NearMiss to use. Values: 1 (selects samples of the majority class that their average distances to three closest instances of the minority class are the smallest), 2 (uses three farthest samples of the minority class), 3 (selects a given number of the closest samples of the majority class for each sample of the minority class).
            * **n_neighbors** (*int*) - (1) [1~100|1] Only for NearMiss, CondensedNearestNeighbour, EditedNearestNeighbours and NeighbourhoodCleaningRule methods. Size of the neighbourhood to consider to compute the average distance to the minority point samples.
            * **threshold_cleaning** (*float*) - (0.5) [0~1|0.1] Only for NeighbourhoodCleaningRule method. Threshold used to whether consider a class or not during the cleaning after applying ENN.
            * **random_state_method** (*int*) - (5) [1~1000|1] Only for RandomUnderSampler and ClusterCentroids methods. Controls the randomization of the algorithm.
            * **random_state_evaluate** (*int*) - (5) [1~1000|1] Controls the shuffling applied to the Repeated Stratified K-Fold evaluation method.
            * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
            * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.

    Examples:
        This is a use example of how to use the building block from Python::

            from biobb_ml.resampling.undersampling import undersampling
            prop = {
                'method': 'enn',
                'type': 'regression',
                'target': {
                    'column': 'target'
                },
                'evaluate': True,
                'n_bins': 10,
                'n_neighbors': 3,
                'sampling_strategy': {
                    'target': 'auto'
                }
            }
            undersampling(input_dataset_path='/path/to/myDataset.csv',
                        output_dataset_path='/path/to/newDataset.csv',
                        properties=prop)

    Info:
        * wrapped_software:
            * name: imbalanced-learn under_sampling
            * version: >0.7.0
            * license: MIT
        * ontology:
            * name: EDAM
            * schema: http://edamontology.org/EDAM.owl

    """

    def __init__(self, input_dataset_path, output_dataset_path,
                 properties=None, **kwargs) -> None:
        properties = properties or {}

        # Call parent class constructor
        super().__init__(properties)
        self.locals_var_dict = locals().copy()

        # Input/Output files
        self.io_dict = {
            "in": {"input_dataset_path": input_dataset_path},
            "out": {"output_dataset_path": output_dataset_path}
        }

        # Properties specific for BB
        self.method = properties.get('method', None)
        self.type = properties.get('type', None)
        self.target = properties.get('target', {})
        self.evaluate = properties.get('evaluate', False)
        self.evaluate_splits = properties.get('evaluate_splits', 3)
        self.evaluate_repeats = properties.get('evaluate_repeats', 3)
        self.n_bins = properties.get('n_bins', 5)
        self.balanced_binning = properties.get('balanced_binning', False)
        self.sampling_strategy = properties.get('sampling_strategy', {'target': 'auto'})
        self.version = properties.get('version', 1)
        self.n_neighbors = properties.get('n_neighbors', 1)
        # FIX: default was 1, but the documented default (and imblearn's own
        # NeighbourhoodCleaningRule default) is 0.5.
        self.threshold_cleaning = properties.get('threshold_cleaning', 0.5)
        self.random_state_method = properties.get('random_state_method', 5)
        self.random_state_evaluate = properties.get('random_state_evaluate', 5)
        self.properties = properties

        # Check the properties
        self.check_properties(properties)
        self.check_arguments()

    def check_data_params(self, out_log, err_log):
        """ Checks all the input/output paths and parameters """
        self.io_dict["in"]["input_dataset_path"] = check_input_path(self.io_dict["in"]["input_dataset_path"], "input_dataset_path", out_log, self.__class__.__name__)
        self.io_dict["out"]["output_dataset_path"] = check_output_path(self.io_dict["out"]["output_dataset_path"], "output_dataset_path", False, out_log, self.__class__.__name__)

    @launchlogger
    def launch(self) -> int:
        """Execute the :class:`Undersampling <resampling.undersampling.Undersampling>` resampling.undersampling.Undersampling object."""

        # check input/output paths and parameters
        self.check_data_params(self.out_log, self.err_log)

        # Setup Biobb
        if self.check_restart():
            return 0
        self.stage_files()

        # check mandatory properties
        method = getResamplingMethod(self.method, 'undersampling', self.out_log, self.__class__.__name__)
        checkResamplingType(self.type, self.out_log, self.__class__.__name__)
        sampling_strategy = getSamplingStrategy(self.sampling_strategy, self.out_log, self.__class__.__name__)

        # load dataset
        fu.log('Getting dataset from %s' % self.io_dict["in"]["input_dataset_path"], self.out_log, self.global_log)
        if 'column' in self.target:
            # target referenced by name -> the CSV has a header row
            labels = getHeader(self.io_dict["in"]["input_dataset_path"])
            skiprows = 1
            header = 0
        else:
            labels = None
            skiprows = None
            header = None
        data = pd.read_csv(self.io_dict["in"]["input_dataset_path"], header=None, sep="\\s+|;|:|,|\t", engine="python", skiprows=skiprows, names=labels)

        train_df = data
        ranges = None

        # FIX: keep one LabelEncoder per categorical column. The previous code
        # reused a single encoder for every object column, so the decoding step
        # below was only correct for the last column that was encoded.
        encoders = {}
        cols_encoded = []
        for column in train_df:
            # if type object, LabelEncoder.fit_transform
            if train_df[column].dtypes == 'object':
                cols_encoded.append(column)
                encoders[column] = preprocessing.LabelEncoder()
                train_df[column] = encoders[column].fit_transform(train_df[column])

        # defining X (all columns except the target)
        X = train_df.loc[:, train_df.columns != getTargetValue(self.target, self.out_log, self.__class__.__name__)]
        # instantiate the selected undersampling method with its specific arguments
        if self.method == 'random':
            method = method(sampling_strategy=sampling_strategy, random_state=self.random_state_method)
        elif self.method == 'nearmiss':
            if self.version == 3:
                # NearMiss version 3 takes the neighbours count through a
                # differently named keyword argument
                method = method(sampling_strategy=sampling_strategy, version=self.version, n_neighbors_ver3=self.n_neighbors)
            else:
                method = method(sampling_strategy=sampling_strategy, version=self.version, n_neighbors=self.n_neighbors)
        elif self.method == 'cnn':
            method = method(sampling_strategy=sampling_strategy, n_neighbors=self.n_neighbors)
        elif self.method == 'tomeklinks':
            method = method(sampling_strategy=sampling_strategy)
        elif self.method == 'enn':
            method = method(sampling_strategy=sampling_strategy, n_neighbors=self.n_neighbors)
        elif self.method == 'ncr':
            method = method(sampling_strategy=sampling_strategy, n_neighbors=self.n_neighbors, threshold_cleaning=self.threshold_cleaning)
        elif self.method == 'cluster':
            method = method(sampling_strategy=sampling_strategy, random_state=self.random_state_method)

        fu.log('Target: %s' % (getTargetValue(self.target, self.out_log, self.__class__.__name__)), self.out_log, self.global_log)

        # undersampling
        if self.type == 'regression':
            fu.log('Undersampling regression dataset, continuous data will be classified', self.out_log, self.global_log)
            # call resampler class for Regression ReSampling
            rs = resampler()
            # Create n_bins classes for the dataset
            ranges, y, target_pos = rs.fit(train_df, target=getTargetValue(self.target, self.out_log, self.__class__.__name__), bins=self.n_bins, balanced_binning=self.balanced_binning, verbose=0)
            # Get the under-sampled data
            final_X, final_y = rs.resample(method, train_df, y)
        elif self.type == 'classification':
            # get X and y
            y = getTarget(self.target, train_df, self.out_log, self.__class__.__name__)
            # fit and resample
            final_X, final_y = method.fit_resample(X, y)
            target_pos = None

        # evaluate dataset before undersampling
        if self.evaluate:
            fu.log('Evaluating data before undersampling with RandomForestClassifier', self.out_log, self.global_log)
            cv = RepeatedStratifiedKFold(n_splits=self.evaluate_splits, n_repeats=self.evaluate_repeats, random_state=self.random_state_evaluate)
            # evaluate model
            scores = cross_val_score(RandomForestClassifier(class_weight='balanced'), X, y, scoring='accuracy', cv=cv, n_jobs=-1)
            if not np.isnan(np.mean(scores)):
                fu.log('Mean Accuracy before undersampling: %.3f' % (np.mean(scores)), self.out_log, self.global_log)
            else:
                fu.log('Unable to calculate cross validation score, NaN was returned.', self.out_log, self.global_log)

        # log distribution before undersampling
        dist = ''
        for k, v in Counter(y).items():
            per = v / len(y) * 100
            rng = ''
            if ranges:
                rng = str(ranges[k])
            dist = dist + 'Class=%d, n=%d (%.3f%%) %s\n' % (k, v, per, rng)
        fu.log('Classes distribution before undersampling:\n\n%s' % dist, self.out_log, self.global_log)

        # join final_X and final_y in the output dataframe
        if header is None:
            # numpy
            out_df = np.column_stack((final_X, final_y))
        else:
            # pandas
            out_df = final_X.join(final_y)

        # if no header, convert np to pd
        if header is None:
            out_df = pd.DataFrame(data=out_df)

        # if cols encoded, decode them back to their original labels
        if cols_encoded:
            for column in cols_encoded:
                if header is None:
                    out_df = out_df.astype({column: int})
                out_df[column] = encoders[column].inverse_transform(out_df[column].values.ravel())

        # if no header, target is in a different column
        # NOTE(review): this truthiness check sends target_pos == 0 to the
        # fallback branch; kept as-is to preserve behaviour — confirm intent.
        if target_pos:
            t = target_pos
        else:
            t = getTargetValue(self.target, self.out_log, self.__class__.__name__)
        # log distribution after undersampling
        if self.type == 'regression':
            ranges, y_out, _ = rs.fit(out_df, target=t, bins=self.n_bins, balanced_binning=self.balanced_binning, verbose=0)
        elif self.type == 'classification':
            y_out = getTarget(self.target, out_df, self.out_log, self.__class__.__name__)

        dist = ''
        for k, v in Counter(y_out).items():
            per = v / len(y_out) * 100
            rng = ''
            if ranges:
                rng = str(ranges[k])
            dist = dist + 'Class=%d, n=%d (%.3f%%) %s\n' % (k, v, per, rng)
        fu.log('Classes distribution after undersampling:\n\n%s' % dist, self.out_log, self.global_log)

        # evaluate dataset after undersampling
        if self.evaluate:
            fu.log('Evaluating data after undersampling with RandomForestClassifier', self.out_log, self.global_log)
            # FIX: honour the configured evaluation properties. This block
            # previously hard-coded n_splits=3, n_repeats=3, random_state=42,
            # inconsistently with the pre-resampling evaluation above.
            cv = RepeatedStratifiedKFold(n_splits=self.evaluate_splits, n_repeats=self.evaluate_repeats, random_state=self.random_state_evaluate)
            # evaluate model
            scores = cross_val_score(RandomForestClassifier(class_weight='balanced'), final_X, y_out, scoring='accuracy', cv=cv, n_jobs=-1)
            if not np.isnan(np.mean(scores)):
                fu.log('Mean Accuracy after undersampling a %s dataset with %s method: %.3f' % (self.type, undersampling_methods[self.method]['method'], np.mean(scores)), self.out_log, self.global_log)
            else:
                fu.log('Unable to calculate cross validation score, NaN was returned.', self.out_log, self.global_log)

        # save output (write the header row only when the input had one)
        hdr = False
        if header == 0:
            hdr = True
        fu.log('Saving undersampled dataset to %s' % self.io_dict["out"]["output_dataset_path"], self.out_log, self.global_log)
        out_df.to_csv(self.io_dict["out"]["output_dataset_path"], index=False, header=hdr)

        # Copy files to host
        self.copy_to_host()

        self.tmp_files.extend([
            self.stage_io_dict.get("unique_dir")
        ])
        self.remove_tmp_files()

        self.check_arguments(output_files_created=True, raise_exception=False)

        return 0

288 

289 

def undersampling(input_dataset_path: str, output_dataset_path: str, properties: dict = None, **kwargs) -> int:
    """Create an :class:`Undersampling <resampling.undersampling.Undersampling>` instance and
    run its :meth:`launch() <resampling.undersampling.Undersampling.launch>` method."""

    # Build the building block and immediately launch it, returning its exit code.
    block = Undersampling(input_dataset_path=input_dataset_path,
                          output_dataset_path=output_dataset_path,
                          properties=properties, **kwargs)
    return block.launch()

297 

298 

def main():
    """Command line execution of this building block. Please check the command line documentation."""
    # The custom formatter keeps long help strings on a single line.
    cli = argparse.ArgumentParser(description="Wrapper of most of the imblearn.under_sampling methods.", formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999))
    cli.add_argument('--config', required=False, help='Configuration file')

    # Mandatory input/output file arguments for this building block.
    mandatory = cli.add_argument_group('required arguments')
    mandatory.add_argument('--input_dataset_path', required=True, help='Path to the input dataset. Accepted formats: csv.')
    mandatory.add_argument('--output_dataset_path', required=True, help='Path to the output dataset. Accepted formats: csv.')

    parsed = cli.parse_args()
    # Fall back to an empty configuration when no file is given.
    parsed.config = parsed.config or "{}"
    prop_dic = settings.ConfReader(config=parsed.config).get_prop_dic()

    # Delegate the actual work to the module-level wrapper.
    undersampling(input_dataset_path=parsed.input_dataset_path,
                  output_dataset_path=parsed.output_dataset_path,
                  properties=prop_dic)

317 

318 

# Entry point when the module is executed directly from the command line.
if __name__ == '__main__':
    main()