Coverage for biobb_ml/resampling/resampling.py: 82%

152 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-05-07 09:39 +0000

1#!/usr/bin/env python3 

2 

3"""Module containing the Resampling class and the command line interface.""" 

4import argparse 

5import pandas as pd 

6import numpy as np 

7from collections import Counter 

8from biobb_common.generic.biobb_object import BiobbObject 

9from sklearn import preprocessing 

10from sklearn.model_selection import cross_val_score 

11from sklearn.model_selection import RepeatedStratifiedKFold 

12from sklearn.ensemble import RandomForestClassifier 

13from biobb_ml.resampling.reg_resampler import resampler 

14from biobb_common.configuration import settings 

15from biobb_common.tools import file_utils as fu 

16from biobb_common.tools.file_utils import launchlogger 

17from biobb_ml.resampling.common import check_input_path, check_output_path, getCombinedMethod, checkResamplingType, getSamplingStrategy, getHeader, getTargetValue, getTarget, resampling_methods 

18 

19 

class Resampling(BiobbObject):
    """
    | biobb_ml Resampling
    | Wrapper of the imblearn.combine methods.
    | Combine over- and under-sampling methods to remove samples and supplement the dataset. If regression is specified as type, the data will be resampled to classes in order to apply the resampling model. Visit the imbalanced-learn official website for the different methods accepted in this wrapper: `SMOTETomek <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.combine.SMOTETomek.html>`_, `SMOTEENN <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.combine.SMOTEENN.html>`_.

    Args:
        input_dataset_path (str): Path to the input dataset. File type: input. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/data/resampling/dataset_resampling.csv>`_. Accepted formats: csv (edam:format_3752).
        output_dataset_path (str): Path to the output dataset. File type: output. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/resampling/ref_output_resampling.csv>`_. Accepted formats: csv (edam:format_3752).
        properties (dic - Python dictionary object containing the tool parameters, not input/output files):
            * **method** (*str*) - (None) Resampling method. It's a mandatory property. Values: smotetomek (`SMOTETomek <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.combine.SMOTETomek.html>`_: Class to perform over-sampling using SMOTE and cleaning using Tomek links), smotenn (`SMOTEENN <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.combine.SMOTEENN.html>`_: Class to perform over-sampling using SMOTE and cleaning using ENN).
            * **type** (*str*) - (None) Type of oversampling. It's a mandatory property. Values: regression (the oversampling will be applied on a continuous dataset), classification (the oversampling will be applied on a classified dataset).
            * **target** (*dict*) - ({}) Dependent variable you want to predict from your dataset. You can specify either a column name or a column index. Formats: { "column": "column3" } or { "index": 21 }. In case of multiple formats, the first one will be picked.
            * **evaluate** (*bool*) - (False) Whether or not to evaluate the dataset before and after applying the resampling.
            * **evaluate_splits** (*int*) - (3) [2~100|1] Number of folds to be applied by the Repeated Stratified K-Fold evaluation method. Must be at least 2.
            * **evaluate_repeats** (*int*) - (3) [2~100|1] Number of times Repeated Stratified K-Fold cross validator needs to be repeated.
            * **n_bins** (*int*) - (5) [1~100|1] Only for regression resampling. The number of classes that the user wants to generate with the target data.
            * **balanced_binning** (*bool*) - (False) Only for regression resampling. Decides whether samples are to be distributed roughly equally across all classes.
            * **sampling_strategy_over** (*dict*) - ({ "target": "auto" }) Sampling information applied in the dataset oversampling process. Formats: { "target": "auto" }, { "ratio": 0.3 } or { "dict": { 0: 300, 1: 200, 2: 100 } }. When "target", specify the class targeted by the resampling; the number of samples in the different classes will be equalized; possible choices are: minority (resample only the minority class), not minority (resample all classes but the minority class), not majority (resample all classes but the majority class), all (resample all classes), auto (equivalent to 'not majority'). When "ratio", it corresponds to the desired ratio of the number of samples in the minority class over the number of samples in the majority class after resampling (ONLY IN CASE OF BINARY CLASSIFICATION). When "dict", the keys correspond to the targeted classes and the values correspond to the desired number of samples for each targeted class.
            * **sampling_strategy_under** (*dict*) - ({ "target": "auto" }) Sampling information applied in the dataset cleaning process. Formats: { "target": "auto" } or { "list": [0, 2, 3] }. When "target", specify the class targeted by the resampling; the number of samples in the different classes will be equalized; possible choices are: majority (resample only the majority class), not minority (resample all classes but the minority class), not majority (resample all classes but the majority class), all (resample all classes), auto (equivalent to 'not minority'). When "list", the list contains the classes targeted by the resampling.
            * **random_state_method** (*int*) - (5) [1~1000|1] Controls the randomization of the algorithm.
            * **random_state_evaluate** (*int*) - (5) [1~1000|1] Controls the shuffling applied to the Repeated Stratified K-Fold evaluation method.
            * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
            * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.

    Examples:
        This is a use example of how to use the building block from Python::

            from biobb_ml.resampling.resampling import resampling
            prop = {
                'method': 'smotenn',
                'type': 'regression',
                'target': {
                    'column': 'target'
                },
                'evaluate': True,
                'n_bins': 10,
                'sampling_strategy_over': {
                    'dict': { '4': 1000, '5': 1000, '6': 1000, '7': 1000 }
                },
                'sampling_strategy_under': {
                    'list': [0,1]
                }
            }
            resampling(input_dataset_path='/path/to/myDataset.csv',
                       output_dataset_path='/path/to/newDataset.csv',
                       properties=prop)

    Info:
        * wrapped_software:
            * name: imbalanced-learn combine
            * version: >0.7.0
            * license: MIT
        * ontology:
            * name: EDAM
            * schema: http://edamontology.org/EDAM.owl

    """

    def __init__(self, input_dataset_path, output_dataset_path,
                 properties=None, **kwargs) -> None:
        """Store the I/O paths and read every tool property (with its default) from ``properties``."""
        properties = properties or {}

        # Call parent class constructor
        super().__init__(properties)
        # Snapshot of the constructor arguments; must be taken before any other local is created
        self.locals_var_dict = locals().copy()

        # Input/Output files
        self.io_dict = {
            "in": {"input_dataset_path": input_dataset_path},
            "out": {"output_dataset_path": output_dataset_path}
        }

        # Properties specific for BB
        self.method = properties.get('method', None)
        self.type = properties.get('type', None)
        self.target = properties.get('target', {})
        self.evaluate = properties.get('evaluate', False)
        self.evaluate_splits = properties.get('evaluate_splits', 3)
        self.evaluate_repeats = properties.get('evaluate_repeats', 3)
        self.n_bins = properties.get('n_bins', 5)
        self.balanced_binning = properties.get('balanced_binning', False)
        self.sampling_strategy_over = properties.get('sampling_strategy_over', {'target': 'auto'})
        self.sampling_strategy_under = properties.get('sampling_strategy_under', {'target': 'auto'})
        self.random_state_method = properties.get('random_state_method', 5)
        self.random_state_evaluate = properties.get('random_state_evaluate', 5)
        self.properties = properties

        # Check the properties
        self.check_properties(properties)
        self.check_arguments()

    def check_data_params(self, out_log, err_log):
        """ Checks all the input/output paths and parameters """
        self.io_dict["in"]["input_dataset_path"] = check_input_path(self.io_dict["in"]["input_dataset_path"], "input_dataset_path", out_log, self.__class__.__name__)
        self.io_dict["out"]["output_dataset_path"] = check_output_path(self.io_dict["out"]["output_dataset_path"], "output_dataset_path", False, out_log, self.__class__.__name__)

    @staticmethod
    def _format_distribution(classes, ranges) -> str:
        """Build the per-class summary (count, percentage and, when given, the bin range) written to the log."""
        total = len(classes)
        lines = []
        for k, v in Counter(classes).items():
            # ranges is only set for regression resampling, where each class is a bin
            rng = str(ranges[k]) if ranges else ''
            lines.append('Class=%d, n=%d (%.3f%%) %s\n' % (k, v, v / total * 100, rng))
        return ''.join(lines)

    @launchlogger
    def launch(self) -> int:
        """Execute the :class:`Resampling <resampling.resampling.Resampling>` resampling.resampling.Resampling object.

        Loads the input dataset, label-encodes its non-numeric columns, resamples it
        with the configured combined method, optionally evaluates it before and after
        with a RandomForestClassifier, and writes the resampled dataset to the output path.

        Returns:
            int: 0, both after a completed run and when ``check_restart()`` skips the execution.
        """

        # check input/output paths and parameters
        self.check_data_params(self.out_log, self.err_log)

        # Setup Biobb
        if self.check_restart():
            return 0
        self.stage_files()

        # check mandatory properties
        method, over, under = getCombinedMethod(self.method, self.out_log, self.__class__.__name__)
        checkResamplingType(self.type, self.out_log, self.__class__.__name__)
        sampling_strategy_over = getSamplingStrategy(self.sampling_strategy_over, self.out_log, self.__class__.__name__)
        sampling_strategy_under = getSamplingStrategy(self.sampling_strategy_under, self.out_log, self.__class__.__name__)

        # load dataset
        fu.log('Getting dataset from %s' % self.io_dict["in"]["input_dataset_path"], self.out_log, self.global_log)
        if 'column' in self.target:
            # target given by name: take the names from the first row and skip it when reading
            labels = getHeader(self.io_dict["in"]["input_dataset_path"])
            skiprows = 1
            header = 0
        else:
            labels = None
            skiprows = None
            header = None
        data = pd.read_csv(self.io_dict["in"]["input_dataset_path"], header=None, sep="\\s+|;|:|,|\t", engine="python", skiprows=skiprows, names=labels)

        train_df = data
        ranges = None

        # Label-encode every object column. Each column keeps its OWN fitted encoder:
        # a shared encoder would be re-fitted by every column and could only decode
        # the last one correctly.
        encoders = {}
        for column in train_df:
            if train_df[column].dtypes == 'object':
                encoder = preprocessing.LabelEncoder()
                train_df[column] = encoder.fit_transform(train_df[column])
                encoders[column] = encoder

        # target column (name or index), resolved once and reused below
        target_value = getTargetValue(self.target, self.out_log, self.__class__.__name__)

        # defining X
        X = train_df.loc[:, train_df.columns != target_value]
        # calling resample method
        if self.method == 'smotetomek':
            method = method(smote=over(sampling_strategy=sampling_strategy_over), tomek=under(sampling_strategy=sampling_strategy_under), random_state=self.random_state_method)
        elif self.method == 'smotenn':
            method = method(smote=over(sampling_strategy=sampling_strategy_over), enn=under(sampling_strategy=sampling_strategy_under), random_state=self.random_state_method)

        fu.log('Target: %s' % (target_value), self.out_log, self.global_log)

        # resampling
        if self.type == 'regression':
            fu.log('Resampling regression dataset, continuous data will be classified', self.out_log, self.global_log)
            # call resampler class for Regression ReSampling
            rs = resampler()
            # Create n_bins classes for the dataset
            ranges, y, target_pos = rs.fit(train_df, target=target_value, bins=self.n_bins, balanced_binning=self.balanced_binning, verbose=0)
            # Get the re-sampled data
            final_X, final_y = rs.resample(method, train_df, y)
        elif self.type == 'classification':
            # get X and y
            y = getTarget(self.target, train_df, self.out_log, self.__class__.__name__)
            # fit and resample
            final_X, final_y = method.fit_resample(X, y)
            target_pos = None

        # evaluate resampling
        if self.evaluate:
            fu.log('Evaluating data before resampling with RandomForestClassifier', self.out_log, self.global_log)
            cv = RepeatedStratifiedKFold(n_splits=self.evaluate_splits, n_repeats=self.evaluate_repeats, random_state=self.random_state_evaluate)
            # evaluate model
            scores = cross_val_score(RandomForestClassifier(class_weight='balanced'), X, y, scoring='accuracy', cv=cv, n_jobs=-1)
            if not np.isnan(np.mean(scores)):
                fu.log('Mean Accuracy before resampling: %.3f' % (np.mean(scores)), self.out_log, self.global_log)
            else:
                fu.log('Unable to calculate cross validation score, NaN was returned.', self.out_log, self.global_log)

        # log distribution before resampling
        dist = self._format_distribution(y, ranges)
        fu.log('Classes distribution before resampling:\n\n%s' % dist, self.out_log, self.global_log)

        # join final_X and final_y in the output dataframe
        if header is None:
            # no header: stack as numpy and convert back to pandas
            out_df = pd.DataFrame(data=np.column_stack((final_X, final_y)))
        else:
            # pandas
            out_df = final_X.join(final_y)

        # if cols encoded, decode each one with the encoder fitted on it
        for column, encoder in encoders.items():
            if header is None:
                out_df = out_df.astype({column: int})
            out_df[column] = encoder.inverse_transform(out_df[column].values.ravel())

        # if no header, target is in a different column
        if target_pos:
            t = target_pos
        else:
            t = target_value
        # log distribution after resampling
        if self.type == 'regression':
            ranges, y_out, _ = rs.fit(out_df, target=t, bins=self.n_bins, balanced_binning=self.balanced_binning, verbose=0)
        elif self.type == 'classification':
            y_out = getTarget(self.target, out_df, self.out_log, self.__class__.__name__)

        dist = self._format_distribution(y_out, ranges)
        fu.log('Classes distribution after resampling:\n\n%s' % dist, self.out_log, self.global_log)

        # evaluate resampling
        if self.evaluate:
            fu.log('Evaluating data after resampling with RandomForestClassifier', self.out_log, self.global_log)
            # same user-configured cross validator as the "before" evaluation, so both scores are comparable
            cv = RepeatedStratifiedKFold(n_splits=self.evaluate_splits, n_repeats=self.evaluate_repeats, random_state=self.random_state_evaluate)
            # evaluate model
            scores = cross_val_score(RandomForestClassifier(class_weight='balanced'), final_X, y_out, scoring='accuracy', cv=cv, n_jobs=-1)
            if not np.isnan(np.mean(scores)):
                fu.log('Mean Accuracy after resampling a %s dataset with %s method: %.3f' % (self.type, resampling_methods[self.method]['method'], np.mean(scores)), self.out_log, self.global_log)
            else:
                fu.log('Unable to calculate cross validation score, NaN was returned.', self.out_log, self.global_log)

        # save output; the header row is only written when the input had one
        hdr = header == 0
        fu.log('Saving resampled dataset to %s' % self.io_dict["out"]["output_dataset_path"], self.out_log, self.global_log)
        out_df.to_csv(self.io_dict["out"]["output_dataset_path"], index=False, header=hdr)

        # Copy files to host
        self.copy_to_host()

        self.tmp_files.extend([
            self.stage_io_dict.get("unique_dir")
        ])
        self.remove_tmp_files()

        self.check_arguments(output_files_created=True, raise_exception=False)

        return 0

274 

275 

def resampling(input_dataset_path: str, output_dataset_path: str, properties: dict = None, **kwargs) -> int:
    """Build a :class:`Resampling <resampling.resampling.Resampling>` object from the given
    arguments and return the result of its :meth:`launch() <resampling.resampling.Resampling.launch>` method."""
    building_block = Resampling(
        input_dataset_path=input_dataset_path,
        output_dataset_path=output_dataset_path,
        properties=properties,
        **kwargs
    )
    return building_block.launch()

283 

284 

def main():
    """Command line entry point: parse the arguments, read the properties from the
    configuration and run the building block. Please check the command line documentation."""

    def wide_help_formatter(prog):
        # very large width so argparse does not wrap the help text
        return argparse.RawTextHelpFormatter(prog, width=99999)

    parser = argparse.ArgumentParser(
        description="Wrapper of the imblearn.combine methods.",
        formatter_class=wide_help_formatter,
    )
    parser.add_argument('--config', required=False, help='Configuration file')

    # Arguments specific to this building block
    required_group = parser.add_argument_group('required arguments')
    required_group.add_argument(
        '--input_dataset_path',
        required=True,
        help='Path to the input dataset. Accepted formats: csv.',
    )
    required_group.add_argument(
        '--output_dataset_path',
        required=True,
        help='Path to the output dataset. Accepted formats: csv.',
    )

    args = parser.parse_args()
    # fall back to an empty JSON object when no configuration was given
    config = args.config or "{}"
    properties = settings.ConfReader(config=config).get_prop_dic()

    # Run the building block with the parsed paths and properties
    resampling(
        input_dataset_path=args.input_dataset_path,
        output_dataset_path=args.output_dataset_path,
        properties=properties,
    )

303 

304 

# Run the command line interface only when this file is executed as a script.
if '__main__' == __name__:
    main()