Coverage for biobb_ml/resampling/resampling.py: 82%
152 statements
« prev ^ index » next coverage.py v7.5.1, created at 2024-05-07 09:39 +0000
« prev ^ index » next coverage.py v7.5.1, created at 2024-05-07 09:39 +0000
1#!/usr/bin/env python3
3"""Module containing the Resampling class and the command line interface."""
4import argparse
5import pandas as pd
6import numpy as np
7from collections import Counter
8from biobb_common.generic.biobb_object import BiobbObject
9from sklearn import preprocessing
10from sklearn.model_selection import cross_val_score
11from sklearn.model_selection import RepeatedStratifiedKFold
12from sklearn.ensemble import RandomForestClassifier
13from biobb_ml.resampling.reg_resampler import resampler
14from biobb_common.configuration import settings
15from biobb_common.tools import file_utils as fu
16from biobb_common.tools.file_utils import launchlogger
17from biobb_ml.resampling.common import check_input_path, check_output_path, getCombinedMethod, checkResamplingType, getSamplingStrategy, getHeader, getTargetValue, getTarget, resampling_methods
class Resampling(BiobbObject):
    """
    | biobb_ml Resampling
    | Wrapper of the imblearn.combine methods.
    | Combine over- and under-sampling methods to remove samples and supplement the dataset. If regression is specified as type, the data will be resampled to classes in order to apply the resampling model. Visit the imbalanced-learn official website for the different methods accepted in this wrapper: `SMOTETomek <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.combine.SMOTETomek.html>`_, `SMOTEENN <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.combine.SMOTEENN.html>`_.

    Args:
        input_dataset_path (str): Path to the input dataset. File type: input. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/data/resampling/dataset_resampling.csv>`_. Accepted formats: csv (edam:format_3752).
        output_dataset_path (str): Path to the output dataset. File type: output. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/resampling/ref_output_resampling.csv>`_. Accepted formats: csv (edam:format_3752).
        properties (dic - Python dictionary object containing the tool parameters, not input/output files):
            * **method** (*str*) - (None) Resampling method. It's a mandatory property. Values: smotetomek (`SMOTETomek <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.combine.SMOTETomek.html>`_: Class to perform over-sampling using SMOTE and cleaning using Tomek links), smotenn (`SMOTEENN <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.combine.SMOTEENN.html>`_: Class to perform over-sampling using SMOTE and cleaning using ENN).
            * **type** (*str*) - (None) Type of oversampling. It's a mandatory property. Values: regression (the oversampling will be applied on a continuous dataset), classification (the oversampling will be applied on a classified dataset).
            * **target** (*dict*) - ({}) Dependent variable you want to predict from your dataset. You can specify either a column name or a column index. Formats: { "column": "column3" } or { "index": 21 }. In case of multiple formats, the first one will be picked.
            * **evaluate** (*bool*) - (False) Whether or not to evaluate the dataset before and after applying the resampling.
            * **evaluate_splits** (*int*) - (3) [2~100|1] Number of folds to be applied by the Repeated Stratified K-Fold evaluation method. Must be at least 2.
            * **evaluate_repeats** (*int*) - (3) [2~100|1] Number of times Repeated Stratified K-Fold cross validator needs to be repeated.
            * **n_bins** (*int*) - (5) [1~100|1] Only for regression resampling. The number of classes that the user wants to generate with the target data.
            * **balanced_binning** (*bool*) - (False) Only for regression resampling. Decides whether samples are to be distributed roughly equally across all classes.
            * **sampling_strategy_over** (*dict*) - ({ "target": "auto" }) Sampling information applied in the dataset oversampling process. Formats: { "target": "auto" }, { "ratio": 0.3 } or { "dict": { 0: 300, 1: 200, 2: 100 } }. When "target", specify the class targeted by the resampling; the number of samples in the different classes will be equalized; possible choices are: minority (resample only the minority class), not minority (resample all classes but the minority class), not majority (resample all classes but the majority class), all (resample all classes), auto (equivalent to 'not majority'). When "ratio", it corresponds to the desired ratio of the number of samples in the minority class over the number of samples in the majority class after resampling (ONLY IN CASE OF BINARY CLASSIFICATION). When "dict", the keys correspond to the targeted classes and the values correspond to the desired number of samples for each targeted class.
            * **sampling_strategy_under** (*dict*) - ({ "target": "auto" }) Sampling information applied in the dataset cleaning process. Formats: { "target": "auto" } or { "list": [0, 2, 3] }. When "target", specify the class targeted by the resampling; the number of samples in the different classes will be equalized; possible choices are: majority (resample only the majority class), not minority (resample all classes but the minority class), not majority (resample all classes but the majority class), all (resample all classes), auto (equivalent to 'not minority'). When "list", the list contains the classes targeted by the resampling.
            * **random_state_method** (*int*) - (5) [1~1000|1] Controls the randomization of the algorithm.
            * **random_state_evaluate** (*int*) - (5) [1~1000|1] Controls the shuffling applied to the Repeated Stratified K-Fold evaluation method.
            * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
            * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.

    Examples:
        This is a use example of how to use the building block from Python::

            from biobb_ml.resampling.resampling import resampling
            prop = {
                'method': 'smotenn',
                'type': 'regression',
                'target': {
                    'column': 'target'
                },
                'evaluate': True,
                'n_bins': 10,
                'sampling_strategy_over': {
                    'dict': { '4': 1000, '5': 1000, '6': 1000, '7': 1000 }
                },
                'sampling_strategy_under': {
                    'list': [0,1]
                }
            }
            resampling(input_dataset_path='/path/to/myDataset.csv',
                       output_dataset_path='/path/to/newDataset.csv',
                       properties=prop)

    Info:
        * wrapped_software:
            * name: imbalanced-learn combine
            * version: >0.7.0
            * license: MIT
        * ontology:
            * name: EDAM
            * schema: http://edamontology.org/EDAM.owl

    """

    def __init__(self, input_dataset_path, output_dataset_path,
                 properties=None, **kwargs) -> None:
        """Store the input/output paths and read every tool property (with its default) from ``properties``."""
        properties = properties or {}

        # Call parent class constructor
        super().__init__(properties)
        # Snapshot of the constructor arguments; must be taken before any other local is created
        self.locals_var_dict = locals().copy()

        # Input/Output files
        self.io_dict = {
            "in": {"input_dataset_path": input_dataset_path},
            "out": {"output_dataset_path": output_dataset_path}
        }

        # Properties specific for BB
        self.method = properties.get('method', None)
        self.type = properties.get('type', None)
        self.target = properties.get('target', {})
        self.evaluate = properties.get('evaluate', False)
        self.evaluate_splits = properties.get('evaluate_splits', 3)
        self.evaluate_repeats = properties.get('evaluate_repeats', 3)
        self.n_bins = properties.get('n_bins', 5)
        self.balanced_binning = properties.get('balanced_binning', False)
        self.sampling_strategy_over = properties.get('sampling_strategy_over', {'target': 'auto'})
        self.sampling_strategy_under = properties.get('sampling_strategy_under', {'target': 'auto'})
        self.random_state_method = properties.get('random_state_method', 5)
        self.random_state_evaluate = properties.get('random_state_evaluate', 5)
        self.properties = properties

        # Check the properties
        self.check_properties(properties)
        self.check_arguments()

    def check_data_params(self, out_log, err_log):
        """ Checks all the input/output paths and parameters """
        self.io_dict["in"]["input_dataset_path"] = check_input_path(self.io_dict["in"]["input_dataset_path"], "input_dataset_path", out_log, self.__class__.__name__)
        self.io_dict["out"]["output_dataset_path"] = check_output_path(self.io_dict["out"]["output_dataset_path"], "output_dataset_path", False, out_log, self.__class__.__name__)

    @staticmethod
    def _format_distribution(classes, ranges):
        """Return one 'Class=<k>, n=<count> (<percent>%) <range>' line per distinct value in ``classes``.

        ``ranges`` is indexed by class value to show the bin range; it is skipped when falsy.
        """
        total = len(classes)
        lines = []
        for label, count in Counter(classes).items():
            bin_range = str(ranges[label]) if ranges else ''
            lines.append('Class=%d, n=%d (%.3f%%) %s\n' % (label, count, count / total * 100, bin_range))
        return ''.join(lines)

    def _mean_cv_accuracy(self, X, y):
        """Return the mean accuracy of a balanced RandomForestClassifier under Repeated Stratified K-Fold.

        The folds, repeats and seed always come from the ``evaluate_splits``, ``evaluate_repeats`` and
        ``random_state_evaluate`` properties, so the before/after evaluations are run under the same setup.
        """
        cv = RepeatedStratifiedKFold(n_splits=self.evaluate_splits, n_repeats=self.evaluate_repeats, random_state=self.random_state_evaluate)
        scores = cross_val_score(RandomForestClassifier(class_weight='balanced'), X, y, scoring='accuracy', cv=cv, n_jobs=-1)
        return np.mean(scores)

    @launchlogger
    def launch(self) -> int:
        """Execute the :class:`Resampling <resampling.resampling.Resampling>` resampling.resampling.Resampling object.

        Loads the input dataset, label-encodes its object columns, resamples it with the configured
        combined method, logs the class distribution before and after (plus cross-validated accuracy
        when ``evaluate`` is set) and writes the resampled dataset to the output path.

        Returns:
            int: 0 once the block has finished (also when the restart check skips the execution).
        """

        # check input/output paths and parameters
        self.check_data_params(self.out_log, self.err_log)

        # Setup Biobb
        if self.check_restart():
            return 0
        self.stage_files()

        class_name = self.__class__.__name__

        # check mandatory properties
        method, over, under = getCombinedMethod(self.method, self.out_log, class_name)
        checkResamplingType(self.type, self.out_log, class_name)
        sampling_strategy_over = getSamplingStrategy(self.sampling_strategy_over, self.out_log, class_name)
        sampling_strategy_under = getSamplingStrategy(self.sampling_strategy_under, self.out_log, class_name)

        # load dataset
        fu.log('Getting dataset from %s' % self.io_dict["in"]["input_dataset_path"], self.out_log, self.global_log)
        # the file is treated as having a header row only when the target is given by column name
        has_header = 'column' in self.target
        if has_header:
            labels = getHeader(self.io_dict["in"]["input_dataset_path"])
            skiprows = 1
        else:
            labels = None
            skiprows = None
        # header row (if any) is skipped and its names are passed explicitly through `names`
        train_df = pd.read_csv(self.io_dict["in"]["input_dataset_path"], header=None, sep="\\s+|;|:|,|\t", engine="python", skiprows=skiprows, names=labels)

        ranges = None

        # label-encode every object column; one encoder per column so that each column
        # can be decoded later with its own classes
        encoders = {}
        for column in train_df:
            if train_df[column].dtypes == 'object':
                encoders[column] = preprocessing.LabelEncoder()
                train_df[column] = encoders[column].fit_transform(train_df[column])

        # target identifier (column name or index), resolved once and reused below
        target_value = getTargetValue(self.target, self.out_log, class_name)

        # defining X
        X = train_df.loc[:, train_df.columns != target_value]
        # calling resample method
        if self.method == 'smotetomek':
            method = method(smote=over(sampling_strategy=sampling_strategy_over), tomek=under(sampling_strategy=sampling_strategy_under), random_state=self.random_state_method)
        elif self.method == 'smotenn':
            method = method(smote=over(sampling_strategy=sampling_strategy_over), enn=under(sampling_strategy=sampling_strategy_under), random_state=self.random_state_method)

        fu.log('Target: %s' % (target_value), self.out_log, self.global_log)

        # resampling
        if self.type == 'regression':
            fu.log('Resampling regression dataset, continuous data will be classified', self.out_log, self.global_log)
            # call resampler class for Regression ReSampling
            rs = resampler()
            # Create n_bins classes for the dataset
            ranges, y, target_pos = rs.fit(train_df, target=target_value, bins=self.n_bins, balanced_binning=self.balanced_binning, verbose=0)
            # Get the re-sampled data
            final_X, final_y = rs.resample(method, train_df, y)
        elif self.type == 'classification':
            # get X and y
            y = getTarget(self.target, train_df, self.out_log, class_name)
            # fit and resample
            final_X, final_y = method.fit_resample(X, y)
            target_pos = None

        # evaluate resampling
        if self.evaluate:
            fu.log('Evaluating data before resampling with RandomForestClassifier', self.out_log, self.global_log)
            mean_score = self._mean_cv_accuracy(X, y)
            if not np.isnan(mean_score):
                fu.log('Mean Accuracy before resampling: %.3f' % (mean_score), self.out_log, self.global_log)
            else:
                fu.log('Unable to calculate cross validation score, NaN was returned.', self.out_log, self.global_log)

        # log distribution before resampling
        fu.log('Classes distribution before resampling:\n\n%s' % self._format_distribution(y, ranges), self.out_log, self.global_log)

        # join final_X and final_y in the output dataframe
        if has_header:
            # pandas
            out_df = final_X.join(final_y)
        else:
            # numpy, converted back to a DataFrame
            out_df = pd.DataFrame(data=np.column_stack((final_X, final_y)))

        # if cols encoded, decode each of them with the encoder fitted on that same column
        for column, encoder in encoders.items():
            if not has_header:
                out_df = out_df.astype({column: int})
            out_df[column] = encoder.inverse_transform(out_df[column].values.ravel())

        # if no header, target is in a different column
        # NOTE(review): a target_pos of 0 is falsy and falls through to the configured target - confirm intended
        if target_pos:
            t = target_pos
        else:
            t = target_value
        # log distribution after resampling
        if self.type == 'regression':
            ranges, y_out, _ = rs.fit(out_df, target=t, bins=self.n_bins, balanced_binning=self.balanced_binning, verbose=0)
        elif self.type == 'classification':
            y_out = getTarget(self.target, out_df, self.out_log, class_name)

        fu.log('Classes distribution after resampling:\n\n%s' % self._format_distribution(y_out, ranges), self.out_log, self.global_log)

        # evaluate resampling
        if self.evaluate:
            fu.log('Evaluating data after resampling with RandomForestClassifier', self.out_log, self.global_log)
            mean_score = self._mean_cv_accuracy(final_X, y_out)
            if not np.isnan(mean_score):
                fu.log('Mean Accuracy after resampling a %s dataset with %s method: %.3f' % (self.type, resampling_methods[self.method]['method'], mean_score), self.out_log, self.global_log)
            else:
                fu.log('Unable to calculate cross validation score, NaN was returned.', self.out_log, self.global_log)

        # save output (header row written only when the input had one)
        fu.log('Saving resampled dataset to %s' % self.io_dict["out"]["output_dataset_path"], self.out_log, self.global_log)
        out_df.to_csv(self.io_dict["out"]["output_dataset_path"], index=False, header=has_header)

        # Copy files to host
        self.copy_to_host()

        self.tmp_files.extend([
            self.stage_io_dict.get("unique_dir")
        ])
        self.remove_tmp_files()

        self.check_arguments(output_files_created=True, raise_exception=False)

        return 0
def resampling(input_dataset_path: str, output_dataset_path: str, properties: dict = None, **kwargs) -> int:
    """Create a :class:`Resampling <resampling.resampling.Resampling>` object from the given arguments
    and return the result of its :meth:`launch() <resampling.resampling.Resampling.launch>` method."""
    block = Resampling(
        input_dataset_path=input_dataset_path,
        output_dataset_path=output_dataset_path,
        properties=properties,
        **kwargs,
    )
    return block.launch()
def main():
    """Command line execution of this building block. Please check the command line documentation."""

    def wide_help_formatter(prog):
        # raw-text help formatter built with a very large width
        return argparse.RawTextHelpFormatter(prog, width=99999)

    parser = argparse.ArgumentParser(description="Wrapper of the imblearn.combine methods.", formatter_class=wide_help_formatter)
    parser.add_argument('--config', required=False, help='Configuration file')

    # Arguments specific to this building block, all mandatory
    required_args = parser.add_argument_group('required arguments')
    mandatory = (
        ('--input_dataset_path', 'Path to the input dataset. Accepted formats: csv.'),
        ('--output_dataset_path', 'Path to the output dataset. Accepted formats: csv.'),
    )
    for flag, help_text in mandatory:
        required_args.add_argument(flag, required=True, help=help_text)

    args = parser.parse_args()
    # fall back to an empty JSON object when no configuration is given
    config = args.config or "{}"
    properties = settings.ConfReader(config=config).get_prop_dic()

    # Run the building block with the parsed paths and properties
    resampling(
        input_dataset_path=args.input_dataset_path,
        output_dataset_path=args.output_dataset_path,
        properties=properties,
    )
# Run the command line interface only when this file is executed as a script
if __name__ == "__main__":
    main()