Coverage for biobb_ml/resampling/undersampling.py: 78%
164 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-10-03 14:57 +0000
« prev ^ index » next coverage.py v7.6.1, created at 2024-10-03 14:57 +0000
1#!/usr/bin/env python3
3"""Module containing the Undersampling class and the command line interface."""
4import argparse
5import pandas as pd
6import numpy as np
7from collections import Counter
8from biobb_common.generic.biobb_object import BiobbObject
9from sklearn import preprocessing
10from sklearn.model_selection import cross_val_score
11from sklearn.model_selection import RepeatedStratifiedKFold
12from sklearn.ensemble import RandomForestClassifier
13from biobb_ml.resampling.reg_resampler import resampler
14from biobb_common.configuration import settings
15from biobb_common.tools import file_utils as fu
16from biobb_common.tools.file_utils import launchlogger
17from biobb_ml.resampling.common import check_input_path, check_output_path, checkResamplingType, getSamplingStrategy, getHeader, getTargetValue, getTarget, getResamplingMethod, undersampling_methods
class Undersampling(BiobbObject):
    """
    | biobb_ml Undersampling
    | Wrapper of most of the imblearn.under_sampling methods.
    | Remove samples from the majority class of a given dataset, with or without replacement. If regression is specified as type, the data will be resampled to classes in order to apply the undersampling model. Visit the imbalanced-learn official website for the different methods accepted in this wrapper: `RandomUnderSampler <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.RandomUnderSampler.html>`_, `NearMiss <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.NearMiss.html>`_, `CondensedNearestNeighbour <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.CondensedNearestNeighbour.html>`_, `TomekLinks <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.TomekLinks.html>`_, `EditedNearestNeighbours <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.EditedNearestNeighbours.html>`_, `NeighbourhoodCleaningRule <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.NeighbourhoodCleaningRule.html>`_, `ClusterCentroids <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.ClusterCentroids.html>`_.

    Args:
        input_dataset_path (str): Path to the input dataset. File type: input. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/data/resampling/dataset_resampling.csv>`_. Accepted formats: csv (edam:format_3752).
        output_dataset_path (str): Path to the output dataset. File type: output. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/resampling/ref_output_undersampling.csv>`_. Accepted formats: csv (edam:format_3752).
        properties (dic - Python dictionary object containing the tool parameters, not input/output files):
            * **method** (*str*) - (None) Undersampling method. It's a mandatory property. Values: random (`RandomUnderSampler <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.RandomUnderSampler.html>`_: Under-sample the majority classes by randomly picking samples with or without replacement), nearmiss (`NearMiss <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.NearMiss.html>`_: Class to perform under-sampling based on NearMiss methods), cnn (`CondensedNearestNeighbour <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.CondensedNearestNeighbour.html>`_: Class to perform under-sampling based on the condensed nearest neighbour method), tomeklinks (`TomekLinks <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.TomekLinks.html>`_: Class to perform under-sampling by removing Tomek's links), enn (`EditedNearestNeighbours <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.EditedNearestNeighbours.html>`_: Class to perform under-sampling based on the edited nearest neighbour method), ncr (`NeighbourhoodCleaningRule <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.NeighbourhoodCleaningRule.html>`_: Class performing under-sampling based on the neighbourhood cleaning rule), cluster (`ClusterCentroids <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.ClusterCentroids.html>`_: Method that under samples the majority class by replacing a cluster of majority samples by the cluster centroid of a KMeans algorithm).
            * **type** (*str*) - (None) Type of undersampling. It's a mandatory property. Values: regression (the undersampling will be applied on a continuous dataset), classification (the undersampling will be applied on a classified dataset).
            * **target** (*dict*) - ({}) Dependent variable you want to predict from your dataset. You can specify either a column name or a column index. Formats: { "column": "column3" } or { "index": 21 }. In case of multiple formats, the first one will be picked.
            * **evaluate** (*bool*) - (False) Whether or not to evaluate the dataset before and after applying the resampling.
            * **evaluate_splits** (*int*) - (3) [2~100|1] Number of folds to be applied by the Repeated Stratified K-Fold evaluation method. Must be at least 2.
            * **evaluate_repeats** (*int*) - (3) [2~100|1] Number of times Repeated Stratified K-Fold cross validator needs to be repeated.
            * **n_bins** (*int*) - (5) [1~100|1] Only for regression undersampling. The number of classes that the user wants to generate with the target data.
            * **balanced_binning** (*bool*) - (False) Only for regression undersampling. Decides whether samples are to be distributed roughly equally across all classes.
            * **sampling_strategy** (*dict*) - ({ "target": "auto" }) Sampling information to sample the data set. Formats: { "target": "auto" }, { "ratio": 0.3 }, { "dict": { 0: 300, 1: 200, 2: 100 } } or { "list": [0, 2, 3] }. When "target", specify the class targeted by the resampling; the number of samples in the different classes will be equalized; possible choices are: majority (resample only the majority class), not minority (resample all classes but the minority class), not majority (resample all classes but the majority class), all (resample all classes), auto (equivalent to 'not minority'). When "ratio", it corresponds to the desired ratio of the number of samples in the minority class over the number of samples in the majority class after resampling (ONLY IN CASE OF BINARY CLASSIFICATION). When "dict", the keys correspond to the targeted classes, the values correspond to the desired number of samples for each targeted class. When "list", the list contains the classes targeted by the resampling.
            * **version** (*int*) - (1) Only for NearMiss method. Version of the NearMiss to use. Values: 1 (selects samples of the majority class that their average distances to three closest instances of the minority class are the smallest), 2 (uses three farthest samples of the minority class), 3 (selects a given number of the closest samples of the majority class for each sample of the minority class).
            * **n_neighbors** (*int*) - (1) [1~100|1] Only for NearMiss, CondensedNearestNeighbour, EditedNearestNeighbours and NeighbourhoodCleaningRule methods. Size of the neighbourhood to consider to compute the average distance to the minority point samples.
            * **threshold_cleaning** (*float*) - (0.5) [0~1|0.1] Only for NeighbourhoodCleaningRule method. Threshold used to decide whether to consider a class or not during the cleaning after applying ENN.
            * **random_state_method** (*int*) - (5) [1~1000|1] Only for RandomUnderSampler and ClusterCentroids methods. Controls the randomization of the algorithm.
            * **random_state_evaluate** (*int*) - (5) [1~1000|1] Controls the shuffling applied to the Repeated Stratified K-Fold evaluation method.
            * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
            * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
            * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.

    Examples:
        This is a use example of how to use the building block from Python::

            from biobb_ml.resampling.undersampling import undersampling
            prop = {
                'method': 'enn',
                'type': 'regression',
                'target': {
                    'column': 'target'
                },
                'evaluate': True,
                'n_bins': 10,
                'n_neighbors': 3,
                'sampling_strategy': {
                    'target': 'auto'
                }
            }
            undersampling(input_dataset_path='/path/to/myDataset.csv',
                          output_dataset_path='/path/to/newDataset.csv',
                          properties=prop)

    Info:
        * wrapped_software:
            * name: imbalanced-learn under_sampling
            * version: >0.7.0
            * license: MIT
        * ontology:
            * name: EDAM
            * schema: http://edamontology.org/EDAM.owl

    """

    def __init__(self, input_dataset_path, output_dataset_path,
                 properties=None, **kwargs) -> None:
        """Store the input/output paths and read every tool property, applying the documented defaults."""
        properties = properties or {}

        # Call parent class constructor
        super().__init__(properties)
        self.locals_var_dict = locals().copy()

        # Input/Output files
        self.io_dict = {
            "in": {"input_dataset_path": input_dataset_path},
            "out": {"output_dataset_path": output_dataset_path}
        }

        # Properties specific for BB
        self.method = properties.get('method', None)
        self.type = properties.get('type', None)
        self.target = properties.get('target', {})
        self.evaluate = properties.get('evaluate', False)
        self.evaluate_splits = properties.get('evaluate_splits', 3)
        self.evaluate_repeats = properties.get('evaluate_repeats', 3)
        self.n_bins = properties.get('n_bins', 5)
        self.balanced_binning = properties.get('balanced_binning', False)
        self.sampling_strategy = properties.get('sampling_strategy', {'target': 'auto'})
        self.version = properties.get('version', 1)
        self.n_neighbors = properties.get('n_neighbors', 1)
        # Default aligned with the value documented in the class docstring (0.5).
        self.threshold_cleaning = properties.get('threshold_cleaning', 0.5)
        self.random_state_method = properties.get('random_state_method', 5)
        self.random_state_evaluate = properties.get('random_state_evaluate', 5)
        self.properties = properties

        # Check the properties
        self.check_properties(properties)
        self.check_arguments()

    def check_data_params(self, out_log, err_log):
        """ Checks all the input/output paths and parameters """
        self.io_dict["in"]["input_dataset_path"] = check_input_path(self.io_dict["in"]["input_dataset_path"], "input_dataset_path", out_log, self.__class__.__name__)
        self.io_dict["out"]["output_dataset_path"] = check_output_path(self.io_dict["out"]["output_dataset_path"], "output_dataset_path", False, out_log, self.__class__.__name__)

    @staticmethod
    def _format_distribution(classes, ranges):
        """Return one 'Class=..., n=... (...%) range' line per distinct value found in classes."""
        total = len(classes)
        lines = []
        for k, v in Counter(classes).items():
            # ranges is only available for regression, where classes are generated bins
            rng = str(ranges[k]) if ranges else ''
            lines.append('Class=%d, n=%d (%.3f%%) %s\n' % (k, v, v / total * 100, rng))
        return ''.join(lines)

    def _mean_cv_accuracy(self, X, y):
        """Return the mean accuracy of a balanced RandomForestClassifier under Repeated Stratified K-Fold.

        The cross validator is always built from the evaluate_splits, evaluate_repeats and
        random_state_evaluate properties, so every evaluation uses the same settings.
        """
        cv = RepeatedStratifiedKFold(n_splits=self.evaluate_splits, n_repeats=self.evaluate_repeats, random_state=self.random_state_evaluate)
        scores = cross_val_score(RandomForestClassifier(class_weight='balanced'), X, y, scoring='accuracy', cv=cv, n_jobs=-1)
        return np.mean(scores)

    @launchlogger
    def launch(self) -> int:
        """Execute the :class:`Undersampling <resampling.undersampling.Undersampling>` resampling.undersampling.Undersampling object."""

        # check input/output paths and parameters
        self.check_data_params(self.out_log, self.err_log)

        # Setup Biobb
        if self.check_restart():
            return 0
        self.stage_files()

        # check mandatory properties
        method = getResamplingMethod(self.method, 'undersampling', self.out_log, self.__class__.__name__)
        checkResamplingType(self.type, self.out_log, self.__class__.__name__)
        sampling_strategy = getSamplingStrategy(self.sampling_strategy, self.out_log, self.__class__.__name__)

        # load dataset
        fu.log('Getting dataset from %s' % self.io_dict["in"]["input_dataset_path"], self.out_log, self.global_log)
        if 'column' in self.target:
            # target given by name: column names are read separately and the header row is skipped
            labels = getHeader(self.io_dict["in"]["input_dataset_path"])
            skiprows = 1
            header = 0
        else:
            labels = None
            skiprows = None
            header = None
        data = pd.read_csv(self.io_dict["in"]["input_dataset_path"], header=None, sep="\\s+|;|:|,|\t", engine="python", skiprows=skiprows, names=labels)

        train_df = data
        ranges = None

        # Encode every object column with its OWN LabelEncoder: a single shared encoder
        # would only remember the classes of the last column it was fitted on, making
        # the decoding step below wrong for all the other columns.
        encoders = {}
        for column in train_df:
            if train_df[column].dtypes == 'object':
                encoder = preprocessing.LabelEncoder()
                train_df[column] = encoder.fit_transform(train_df[column])
                encoders[column] = encoder

        # resolve the target (column name or index) once and reuse it below
        target_value = getTargetValue(self.target, self.out_log, self.__class__.__name__)

        # defining X
        X = train_df.loc[:, train_df.columns != target_value]
        # calling undersample method
        if self.method == 'random':
            method = method(sampling_strategy=sampling_strategy, random_state=self.random_state_method)
        elif self.method == 'nearmiss':
            if self.version == 3:
                # NearMiss version 3 receives the neighbourhood size through n_neighbors_ver3
                method = method(sampling_strategy=sampling_strategy, version=self.version, n_neighbors_ver3=self.n_neighbors)
            else:
                method = method(sampling_strategy=sampling_strategy, version=self.version, n_neighbors=self.n_neighbors)
        elif self.method == 'cnn':
            method = method(sampling_strategy=sampling_strategy, n_neighbors=self.n_neighbors)
        elif self.method == 'tomeklinks':
            method = method(sampling_strategy=sampling_strategy)
        elif self.method == 'enn':
            method = method(sampling_strategy=sampling_strategy, n_neighbors=self.n_neighbors)
        elif self.method == 'ncr':
            method = method(sampling_strategy=sampling_strategy, n_neighbors=self.n_neighbors, threshold_cleaning=self.threshold_cleaning)
        elif self.method == 'cluster':
            method = method(sampling_strategy=sampling_strategy, random_state=self.random_state_method)

        fu.log('Target: %s' % (target_value), self.out_log, self.global_log)

        # undersampling
        if self.type == 'regression':
            fu.log('Undersampling regression dataset, continuous data will be classified', self.out_log, self.global_log)
            # call resampler class for Regression ReSampling
            rs = resampler()
            # Create n_bins classes for the dataset
            ranges, y, target_pos = rs.fit(train_df, target=target_value, bins=self.n_bins, balanced_binning=self.balanced_binning, verbose=0)
            # Get the under-sampled data
            final_X, final_y = rs.resample(method, train_df, y)
        elif self.type == 'classification':
            # get X and y
            y = getTarget(self.target, train_df, self.out_log, self.__class__.__name__)
            # fit and resample
            final_X, final_y = method.fit_resample(X, y)
            target_pos = None

        # evaluate undersampling
        if self.evaluate:
            fu.log('Evaluating data before undersampling with RandomForestClassifier', self.out_log, self.global_log)
            mean_score = self._mean_cv_accuracy(X, y)
            if not np.isnan(mean_score):
                fu.log('Mean Accuracy before undersampling: %.3f' % (mean_score), self.out_log, self.global_log)
            else:
                fu.log('Unable to calculate cross validation score, NaN was returned.', self.out_log, self.global_log)

        # log distribution before undersampling
        fu.log('Classes distribution before undersampling:\n\n%s' % self._format_distribution(y, ranges), self.out_log, self.global_log)

        # join final_X and final_y in the output dataframe
        if header is None:
            # no header: stack as numpy and convert back to pandas
            out_df = pd.DataFrame(data=np.column_stack((final_X, final_y)))
        else:
            # pandas
            out_df = final_X.join(final_y)

        # if cols encoded, decode each one with the encoder that was fitted on it
        for column, encoder in encoders.items():
            if header is None:
                out_df = out_df.astype({column: int})
            out_df[column] = encoder.inverse_transform(out_df[column].values.ravel())

        # if no header, target is in a different column
        if target_pos:
            t = target_pos
        else:
            t = target_value
        # log distribution after undersampling
        if self.type == 'regression':
            ranges, y_out, _ = rs.fit(out_df, target=t, bins=self.n_bins, balanced_binning=self.balanced_binning, verbose=0)
        elif self.type == 'classification':
            y_out = getTarget(self.target, out_df, self.out_log, self.__class__.__name__)

        fu.log('Classes distribution after undersampling:\n\n%s' % self._format_distribution(y_out, ranges), self.out_log, self.global_log)

        # evaluate undersampling
        if self.evaluate:
            fu.log('Evaluating data after undersampling with RandomForestClassifier', self.out_log, self.global_log)
            # same user-configured cross validation as the evaluation before undersampling
            mean_score = self._mean_cv_accuracy(final_X, y_out)
            if not np.isnan(mean_score):
                fu.log('Mean Accuracy after undersampling a %s dataset with %s method: %.3f' % (self.type, undersampling_methods[self.method]['method'], mean_score), self.out_log, self.global_log)
            else:
                fu.log('Unable to calculate cross validation score, NaN was returned.', self.out_log, self.global_log)

        # save output, writing the header row only when the input had one
        hdr = header == 0
        fu.log('Saving undersampled dataset to %s' % self.io_dict["out"]["output_dataset_path"], self.out_log, self.global_log)
        out_df.to_csv(self.io_dict["out"]["output_dataset_path"], index=False, header=hdr)

        # Copy files to host
        self.copy_to_host()

        self.tmp_files.extend([
            self.stage_io_dict.get("unique_dir")
        ])
        self.remove_tmp_files()

        self.check_arguments(output_files_created=True, raise_exception=False)

        return 0
def undersampling(input_dataset_path: str, output_dataset_path: str, properties: dict = None, **kwargs) -> int:
    """Create an :class:`Undersampling <resampling.undersampling.Undersampling>` object from the given
    arguments and return the result of its :meth:`launch() <resampling.undersampling.Undersampling.launch>` method."""

    building_block = Undersampling(
        input_dataset_path=input_dataset_path,
        output_dataset_path=output_dataset_path,
        properties=properties,
        **kwargs
    )
    return building_block.launch()
def main():
    """Command line entry point: parse the arguments, read the configuration and run the building block."""

    def wide_help_formatter(prog):
        # very large width so argparse never wraps the help text
        return argparse.RawTextHelpFormatter(prog, width=99999)

    parser = argparse.ArgumentParser(
        description="Wrapper of most of the imblearn.under_sampling methods.",
        formatter_class=wide_help_formatter
    )
    parser.add_argument('--config', required=False, help='Configuration file')

    # Arguments specific to this building block, both mandatory
    required_args = parser.add_argument_group('required arguments')
    mandatory_options = (
        ('--input_dataset_path', 'Path to the input dataset. Accepted formats: csv.'),
        ('--output_dataset_path', 'Path to the output dataset. Accepted formats: csv.'),
    )
    for flag, help_text in mandatory_options:
        required_args.add_argument(flag, required=True, help=help_text)

    args = parser.parse_args()
    # fall back to an empty JSON configuration when --config is missing or empty
    if not args.config:
        args.config = "{}"
    properties = settings.ConfReader(config=args.config).get_prop_dic()

    # Run the building block with the parsed paths and properties
    undersampling(
        input_dataset_path=args.input_dataset_path,
        output_dataset_path=args.output_dataset_path,
        properties=properties
    )
# Run the command line interface only when this file is executed as a script.
if "__main__" == __name__:
    main()