Coverage for biobb_ml/resampling/undersampling.py: 78%

164 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-10-03 14:57 +0000

1#!/usr/bin/env python3 

2 

3"""Module containing the Undersampling class and the command line interface.""" 

4import argparse 

5import pandas as pd 

6import numpy as np 

7from collections import Counter 

8from biobb_common.generic.biobb_object import BiobbObject 

9from sklearn import preprocessing 

10from sklearn.model_selection import cross_val_score 

11from sklearn.model_selection import RepeatedStratifiedKFold 

12from sklearn.ensemble import RandomForestClassifier 

13from biobb_ml.resampling.reg_resampler import resampler 

14from biobb_common.configuration import settings 

15from biobb_common.tools import file_utils as fu 

16from biobb_common.tools.file_utils import launchlogger 

17from biobb_ml.resampling.common import check_input_path, check_output_path, checkResamplingType, getSamplingStrategy, getHeader, getTargetValue, getTarget, getResamplingMethod, undersampling_methods 

18 

19 

class Undersampling(BiobbObject):
    """
    | biobb_ml Undersampling
    | Wrapper of most of the imblearn.under_sampling methods.
    | Remove samples from the majority class of a given dataset, with or without replacement. If regression is specified as type, the data will be resampled to classes in order to apply the undersampling model. Visit the imbalanced-learn official website for the different methods accepted in this wrapper: `RandomUnderSampler <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.RandomUnderSampler.html>`_, `NearMiss <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.NearMiss.html>`_, `CondensedNearestNeighbour <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.CondensedNearestNeighbour.html>`_, `TomekLinks <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.TomekLinks.html>`_, `EditedNearestNeighbours <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.EditedNearestNeighbours.html>`_, `NeighbourhoodCleaningRule <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.NeighbourhoodCleaningRule.html>`_, `ClusterCentroids <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.ClusterCentroids.html>`_.

    Args:
        input_dataset_path (str): Path to the input dataset. File type: input. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/data/resampling/dataset_resampling.csv>`_. Accepted formats: csv (edam:format_3752).
        output_dataset_path (str): Path to the output dataset. File type: output. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/resampling/ref_output_undersampling.csv>`_. Accepted formats: csv (edam:format_3752).
        properties (dic - Python dictionary object containing the tool parameters, not input/output files):
            * **method** (*str*) - (None) Undersampling method. It's a mandatory property. Values: random (`RandomUnderSampler <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.RandomUnderSampler.html>`_: Under-sample the majority classes by randomly picking samples with or without replacement), nearmiss (`NearMiss <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.NearMiss.html>`_: Class to perform under-sampling based on NearMiss methods), cnn (`CondensedNearestNeighbour <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.CondensedNearestNeighbour.html>`_: Class to perform under-sampling based on the condensed nearest neighbour method), tomeklinks (`TomekLinks <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.TomekLinks.html>`_: Class to perform under-sampling by removing Tomek's links), enn (`EditedNearestNeighbours <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.EditedNearestNeighbours.html>`_: Class to perform under-sampling based on the edited nearest neighbour method), ncr (`NeighbourhoodCleaningRule <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.NeighbourhoodCleaningRule.html>`_: Class performing under-sampling based on the neighbourhood cleaning rule), cluster (`ClusterCentroids <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.ClusterCentroids.html>`_: Method that under samples the majority class by replacing a cluster of majority samples by the cluster centroid of a KMeans algorithm).
            * **type** (*str*) - (None) Type of undersampling. It's a mandatory property. Values: regression (the undersampling will be applied on a continuous dataset), classification (the undersampling will be applied on a classified dataset).
            * **target** (*dict*) - ({}) Dependent variable you want to predict from your dataset. You can specify either a column name or a column index. Formats: { "column": "column3" } or { "index": 21 }. In case of multiple formats, the first one will be picked.
            * **evaluate** (*bool*) - (False) Whether or not to evaluate the dataset before and after applying the resampling.
            * **evaluate_splits** (*int*) - (3) [2~100|1] Number of folds to be applied by the Repeated Stratified K-Fold evaluation method. Must be at least 2.
            * **evaluate_repeats** (*int*) - (3) [2~100|1] Number of times Repeated Stratified K-Fold cross validator needs to be repeated.
            * **n_bins** (*int*) - (5) [1~100|1] Only for regression undersampling. The number of classes that the user wants to generate with the target data.
            * **balanced_binning** (*bool*) - (False) Only for regression undersampling. Decides whether samples are to be distributed roughly equally across all classes.
            * **sampling_strategy** (*dict*) - ({ "target": "auto" }) Sampling information to sample the data set. Formats: { "target": "auto" }, { "ratio": 0.3 }, { "dict": { 0: 300, 1: 200, 2: 100 } } or { "list": [0, 2, 3] }. When "target", specify the class targeted by the resampling; the number of samples in the different classes will be equalized; possible choices are: majority (resample only the majority class), not minority (resample all classes but the minority class), not majority (resample all classes but the majority class), all (resample all classes), auto (equivalent to 'not minority'). When "ratio", it corresponds to the desired ratio of the number of samples in the minority class over the number of samples in the majority class after resampling (ONLY IN CASE OF BINARY CLASSIFICATION). When "dict", the keys correspond to the targeted classes, the values correspond to the desired number of samples for each targeted class. When "list", the list contains the classes targeted by the resampling.
            * **version** (*int*) - (1) Only for NearMiss method. Version of the NearMiss to use. Values: 1 (selects samples of the majority class that their average distances to three closest instances of the minority class are the smallest), 2 (uses three farthest samples of the minority class), 3 (selects a given number of the closest samples of the majority class for each sample of the minority class).
            * **n_neighbors** (*int*) - (1) [1~100|1] Only for NearMiss, CondensedNearestNeighbour, EditedNearestNeighbours and NeighbourhoodCleaningRule methods. Size of the neighbourhood to consider to compute the average distance to the minority point samples.
            * **threshold_cleaning** (*float*) - (0.5) [0~1|0.1] Only for NeighbourhoodCleaningRule method. Threshold used to whether consider a class or not during the cleaning after applying ENN.
            * **random_state_method** (*int*) - (5) [1~1000|1] Only for RandomUnderSampler and ClusterCentroids methods. Controls the randomization of the algorithm.
            * **random_state_evaluate** (*int*) - (5) [1~1000|1] Controls the shuffling applied to the Repeated Stratified K-Fold evaluation method.
            * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
            * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
            * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.

    Examples:
        This is a use example of how to use the building block from Python::

            from biobb_ml.resampling.undersampling import undersampling
            prop = {
                'method': 'enn',
                'type': 'regression',
                'target': {
                    'column': 'target'
                },
                'evaluate': True,
                'n_bins': 10,
                'n_neighbors': 3,
                'sampling_strategy': {
                    'target': 'auto'
                }
            }
            undersampling(input_dataset_path='/path/to/myDataset.csv',
                          output_dataset_path='/path/to/newDataset.csv',
                          properties=prop)

    Info:
        * wrapped_software:
            * name: imbalanced-learn under_sampling
            * version: >0.7.0
            * license: MIT
        * ontology:
            * name: EDAM
            * schema: http://edamontology.org/EDAM.owl

    """

    def __init__(self, input_dataset_path, output_dataset_path,
                 properties=None, **kwargs) -> None:
        """Store the input/output paths and read the tool properties, applying the documented defaults."""
        properties = properties or {}

        # Call parent class constructor
        super().__init__(properties)
        self.locals_var_dict = locals().copy()

        # Input/Output files
        self.io_dict = {
            "in": {"input_dataset_path": input_dataset_path},
            "out": {"output_dataset_path": output_dataset_path}
        }

        # Properties specific for BB
        self.method = properties.get('method', None)
        self.type = properties.get('type', None)
        self.target = properties.get('target', {})
        self.evaluate = properties.get('evaluate', False)
        self.evaluate_splits = properties.get('evaluate_splits', 3)
        self.evaluate_repeats = properties.get('evaluate_repeats', 3)
        self.n_bins = properties.get('n_bins', 5)
        self.balanced_binning = properties.get('balanced_binning', False)
        self.sampling_strategy = properties.get('sampling_strategy', {'target': 'auto'})
        self.version = properties.get('version', 1)
        self.n_neighbors = properties.get('n_neighbors', 1)
        # Default aligned with the documented value (0.5); it used to be 1.
        self.threshold_cleaning = properties.get('threshold_cleaning', 0.5)
        self.random_state_method = properties.get('random_state_method', 5)
        self.random_state_evaluate = properties.get('random_state_evaluate', 5)
        self.properties = properties

        # Check the properties
        self.check_properties(properties)
        self.check_arguments()

    def check_data_params(self, out_log, err_log):
        """ Checks all the input/output paths and parameters """
        self.io_dict["in"]["input_dataset_path"] = check_input_path(self.io_dict["in"]["input_dataset_path"], "input_dataset_path", out_log, self.__class__.__name__)
        self.io_dict["out"]["output_dataset_path"] = check_output_path(self.io_dict["out"]["output_dataset_path"], "output_dataset_path", False, out_log, self.__class__.__name__)

    @staticmethod
    def _format_distribution(labels, ranges):
        """Return one 'Class=..., n=... (...%)' line per distinct value in labels.

        When ranges is truthy, the entry ranges[class] is appended to that class's line.
        """
        total = len(labels)
        lines = []
        for k, v in Counter(labels).items():
            rng = str(ranges[k]) if ranges else ''
            lines.append('Class=%d, n=%d (%.3f%%) %s\n' % (k, v, v / total * 100, rng))
        return ''.join(lines)

    def _mean_cv_accuracy(self, X, y):
        """Return the mean accuracy of a balanced RandomForestClassifier on (X, y).

        Uses Repeated Stratified K-Fold configured from the evaluate_splits,
        evaluate_repeats and random_state_evaluate properties, so the scores
        computed before and after resampling share the same protocol.
        """
        cv = RepeatedStratifiedKFold(n_splits=self.evaluate_splits, n_repeats=self.evaluate_repeats, random_state=self.random_state_evaluate)
        scores = cross_val_score(RandomForestClassifier(class_weight='balanced'), X, y, scoring='accuracy', cv=cv, n_jobs=-1)
        return np.mean(scores)

    @launchlogger
    def launch(self) -> int:
        """Execute the :class:`Undersampling <resampling.undersampling.Undersampling>` resampling.undersampling.Undersampling object.

        Reads the input CSV, label-encodes object columns, applies the selected
        undersampling method, logs the class distribution before and after
        (plus cross-validated accuracy when ``evaluate`` is set) and writes the
        resampled dataset to the output path.

        Returns:
            int: 0, both when the run completes and when ``check_restart()`` short-circuits it.
        """

        # check input/output paths and parameters
        self.check_data_params(self.out_log, self.err_log)

        # Setup Biobb
        if self.check_restart():
            return 0
        self.stage_files()

        # check mandatory properties
        method = getResamplingMethod(self.method, 'undersampling', self.out_log, self.__class__.__name__)
        checkResamplingType(self.type, self.out_log, self.__class__.__name__)
        sampling_strategy = getSamplingStrategy(self.sampling_strategy, self.out_log, self.__class__.__name__)

        # load dataset
        fu.log('Getting dataset from %s' % self.io_dict["in"]["input_dataset_path"], self.out_log, self.global_log)
        if 'column' in self.target:
            # Target given by name: take the names from the file's first row and skip that row when parsing.
            labels = getHeader(self.io_dict["in"]["input_dataset_path"])
            skiprows = 1
            header = 0
        else:
            labels = None
            skiprows = None
            header = None
        data = pd.read_csv(self.io_dict["in"]["input_dataset_path"], header=None, sep="\\s+|;|:|,|\t", engine="python", skiprows=skiprows, names=labels)

        train_df = data
        ranges = None

        # Encode every object column with its own LabelEncoder so each column
        # can be decoded later with the classes it was fitted on. A single shared
        # encoder would only remember the last column it was fitted on.
        encoders = {}
        for column in train_df:
            if train_df[column].dtypes == 'object':
                encoder = preprocessing.LabelEncoder()
                train_df[column] = encoder.fit_transform(train_df[column])
                encoders[column] = encoder

        # defining X
        X = train_df.loc[:, train_df.columns != getTargetValue(self.target, self.out_log, self.__class__.__name__)]
        # calling undersample method
        if self.method == 'random':
            method = method(sampling_strategy=sampling_strategy, random_state=self.random_state_method)
        elif self.method == 'nearmiss':
            if self.version == 3:
                method = method(sampling_strategy=sampling_strategy, version=self.version, n_neighbors_ver3=self.n_neighbors)
            else:
                method = method(sampling_strategy=sampling_strategy, version=self.version, n_neighbors=self.n_neighbors)
        elif self.method == 'cnn':
            method = method(sampling_strategy=sampling_strategy, n_neighbors=self.n_neighbors)
        elif self.method == 'tomeklinks':
            method = method(sampling_strategy=sampling_strategy)
        elif self.method == 'enn':
            method = method(sampling_strategy=sampling_strategy, n_neighbors=self.n_neighbors)
        elif self.method == 'ncr':
            method = method(sampling_strategy=sampling_strategy, n_neighbors=self.n_neighbors, threshold_cleaning=self.threshold_cleaning)
        elif self.method == 'cluster':
            method = method(sampling_strategy=sampling_strategy, random_state=self.random_state_method)

        fu.log('Target: %s' % (getTargetValue(self.target, self.out_log, self.__class__.__name__)), self.out_log, self.global_log)

        # undersampling
        # NOTE(review): y, final_X and final_y stay undefined for any other type;
        # checkResamplingType above is presumably what rejects it - confirm.
        if self.type == 'regression':
            fu.log('Undersampling regression dataset, continuous data will be classified', self.out_log, self.global_log)
            # call resampler class for Regression ReSampling
            rs = resampler()
            # Create n_bins classes for the dataset
            ranges, y, target_pos = rs.fit(train_df, target=getTargetValue(self.target, self.out_log, self.__class__.__name__), bins=self.n_bins, balanced_binning=self.balanced_binning, verbose=0)
            # Get the under-sampled data
            final_X, final_y = rs.resample(method, train_df, y)
        elif self.type == 'classification':
            # get X and y
            y = getTarget(self.target, train_df, self.out_log, self.__class__.__name__)
            # fit and resample
            final_X, final_y = method.fit_resample(X, y)
            target_pos = None

        # evaluate undersampling
        if self.evaluate:
            fu.log('Evaluating data before undersampling with RandomForestClassifier', self.out_log, self.global_log)
            mean_score = self._mean_cv_accuracy(X, y)
            if not np.isnan(mean_score):
                fu.log('Mean Accuracy before undersampling: %.3f' % mean_score, self.out_log, self.global_log)
            else:
                fu.log('Unable to calculate cross validation score, NaN was returned.', self.out_log, self.global_log)

        # log distribution before undersampling
        dist = self._format_distribution(y, ranges)
        fu.log('Classes distribution before undersampling:\n\n%s' % dist, self.out_log, self.global_log)

        # join final_X and final_y in the output dataframe
        if header is None:
            # no header: stack as numpy, then convert back to pandas
            out_df = pd.DataFrame(data=np.column_stack((final_X, final_y)))
        else:
            # pandas
            out_df = final_X.join(final_y)

        # if cols encoded, decode each one with the encoder fitted on it
        for column, encoder in encoders.items():
            if header is None:
                out_df = out_df.astype({column: int})
            out_df[column] = encoder.inverse_transform(out_df[column].values.ravel())

        # if no header, target is in a different column
        if target_pos:
            t = target_pos
        else:
            t = getTargetValue(self.target, self.out_log, self.__class__.__name__)
        # log distribution after undersampling
        if self.type == 'regression':
            ranges, y_out, _ = rs.fit(out_df, target=t, bins=self.n_bins, balanced_binning=self.balanced_binning, verbose=0)
        elif self.type == 'classification':
            y_out = getTarget(self.target, out_df, self.out_log, self.__class__.__name__)

        dist = self._format_distribution(y_out, ranges)
        fu.log('Classes distribution after undersampling:\n\n%s' % dist, self.out_log, self.global_log)

        # evaluate undersampling
        if self.evaluate:
            fu.log('Evaluating data after undersampling with RandomForestClassifier', self.out_log, self.global_log)
            # Same CV configuration as the "before" run (it used to be hard-coded to 3/3/42).
            mean_score = self._mean_cv_accuracy(final_X, y_out)
            if not np.isnan(mean_score):
                fu.log('Mean Accuracy after undersampling a %s dataset with %s method: %.3f' % (self.type, undersampling_methods[self.method]['method'], mean_score), self.out_log, self.global_log)
            else:
                fu.log('Unable to calculate cross validation score, NaN was returned.', self.out_log, self.global_log)

        # save output; the header row is written only when the input was read with column names
        fu.log('Saving undersampled dataset to %s' % self.io_dict["out"]["output_dataset_path"], self.out_log, self.global_log)
        out_df.to_csv(self.io_dict["out"]["output_dataset_path"], index=False, header=(header == 0))

        # Copy files to host
        self.copy_to_host()

        self.tmp_files.extend([
            self.stage_io_dict.get("unique_dir")
        ])
        self.remove_tmp_files()

        self.check_arguments(output_files_created=True, raise_exception=False)

        return 0

289 

290 

def undersampling(input_dataset_path: str, output_dataset_path: str, properties: dict = None, **kwargs) -> int:
    """Create an :class:`Undersampling <resampling.undersampling.Undersampling>` object from the
    given paths and properties, run its :meth:`launch() <resampling.undersampling.Undersampling.launch>`
    method and return that method's result."""

    block = Undersampling(
        input_dataset_path=input_dataset_path,
        output_dataset_path=output_dataset_path,
        properties=properties,
        **kwargs
    )
    return block.launch()

298 

299 

def main():
    """Command line execution of this building block. Please check the command line documentation."""

    def _wide_formatter(prog):
        # Very large width so argparse does not wrap the help strings.
        return argparse.RawTextHelpFormatter(prog, width=99999)

    cli = argparse.ArgumentParser(description="Wrapper of most of the imblearn.under_sampling methods.", formatter_class=_wide_formatter)
    cli.add_argument('--config', required=False, help='Configuration file')

    # Specific args of each building block
    io_group = cli.add_argument_group('required arguments')
    io_group.add_argument('--input_dataset_path', required=True, help='Path to the input dataset. Accepted formats: csv.')
    io_group.add_argument('--output_dataset_path', required=True, help='Path to the output dataset. Accepted formats: csv.')

    cli_args = cli.parse_args()
    # A missing --config is replaced by the string "{}" before it reaches ConfReader.
    config = cli_args.config or "{}"
    tool_properties = settings.ConfReader(config=config).get_prop_dic()

    # Specific call of each building block
    undersampling(
        input_dataset_path=cli_args.input_dataset_path,
        output_dataset_path=cli_args.output_dataset_path,
        properties=tool_properties,
    )


if __name__ == '__main__':
    main()