Coverage for biobb_ml/resampling/undersampling.py: 78%
165 statements
« prev ^ index » next coverage.py v7.5.1, created at 2024-05-07 09:39 +0000
1#!/usr/bin/env python3
3"""Module containing the Undersampling class and the command line interface."""
4import argparse
5import pandas as pd
6import numpy as np
7from collections import Counter
8from biobb_common.generic.biobb_object import BiobbObject
9from sklearn import preprocessing
10from sklearn.model_selection import cross_val_score
11from sklearn.model_selection import RepeatedStratifiedKFold
12from sklearn.ensemble import RandomForestClassifier
13from biobb_ml.resampling.reg_resampler import resampler
14from biobb_common.configuration import settings
15from biobb_common.tools import file_utils as fu
16from biobb_common.tools.file_utils import launchlogger
17from biobb_ml.resampling.common import check_input_path, check_output_path, checkResamplingType, getSamplingStrategy, getHeader, getTargetValue, getTarget, getResamplingMethod, undersampling_methods
class Undersampling(BiobbObject):
    """
    | biobb_ml Undersampling
    | Wrapper of most of the imblearn.under_sampling methods.
    | Remove samples from the majority class of a given dataset, with or without replacement. If regression is specified as type, the data will be resampled to classes in order to apply the undersampling model. Visit the imbalanced-learn official website for the different methods accepted in this wrapper: `RandomUnderSampler <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.RandomUnderSampler.html>`_, `NearMiss <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.NearMiss.html>`_, `CondensedNearestNeighbour <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.CondensedNearestNeighbour.html>`_, `TomekLinks <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.TomekLinks.html>`_, `EditedNearestNeighbours <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.EditedNearestNeighbours.html>`_, `NeighbourhoodCleaningRule <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.NeighbourhoodCleaningRule.html>`_, `ClusterCentroids <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.ClusterCentroids.html>`_.

    Args:
        input_dataset_path (str): Path to the input dataset. File type: input. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/data/resampling/dataset_resampling.csv>`_. Accepted formats: csv (edam:format_3752).
        output_dataset_path (str): Path to the output dataset. File type: output. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/resampling/ref_output_undersampling.csv>`_. Accepted formats: csv (edam:format_3752).
        properties (dic - Python dictionary object containing the tool parameters, not input/output files):
            * **method** (*str*) - (None) Undersampling method. It's a mandatory property. Values: random (`RandomUnderSampler <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.RandomUnderSampler.html>`_: Under-sample the majority classes by randomly picking samples with or without replacement), nearmiss (`NearMiss <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.NearMiss.html>`_: Class to perform under-sampling based on NearMiss methods), cnn (`CondensedNearestNeighbour <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.CondensedNearestNeighbour.html>`_: Class to perform under-sampling based on the condensed nearest neighbour method), tomeklinks (`TomekLinks <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.TomekLinks.html>`_: Class to perform under-sampling by removing Tomek's links), enn (`EditedNearestNeighbours <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.EditedNearestNeighbours.html>`_: Class to perform under-sampling based on the edited nearest neighbour method), ncr (`NeighbourhoodCleaningRule <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.NeighbourhoodCleaningRule.html>`_: Class performing under-sampling based on the neighbourhood cleaning rule), cluster (`ClusterCentroids <https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.ClusterCentroids.html>`_: Method that under samples the majority class by replacing a cluster of majority samples by the cluster centroid of a KMeans algorithm).
            * **type** (*str*) - (None) Type of undersampling. It's a mandatory property. Values: regression (the undersampling will be applied on a continuous dataset), classification (the undersampling will be applied on a classified dataset).
            * **target** (*dict*) - ({}) Dependent variable you want to predict from your dataset. You can specify either a column name or a column index. Formats: { "column": "column3" } or { "index": 21 }. In case of multiple formats, the first one will be picked.
            * **evaluate** (*bool*) - (False) Whether or not to evaluate the dataset before and after applying the resampling.
            * **evaluate_splits** (*int*) - (3) [2~100|1] Number of folds to be applied by the Repeated Stratified K-Fold evaluation method. Must be at least 2.
            * **evaluate_repeats** (*int*) - (3) [2~100|1] Number of times Repeated Stratified K-Fold cross validator needs to be repeated.
            * **n_bins** (*int*) - (5) [1~100|1] Only for regression undersampling. The number of classes that the user wants to generate with the target data.
            * **balanced_binning** (*bool*) - (False) Only for regression undersampling. Decides whether samples are to be distributed roughly equally across all classes.
            * **sampling_strategy** (*dict*) - ({ "target": "auto" }) Sampling information to sample the data set. Formats: { "target": "auto" }, { "ratio": 0.3 }, { "dict": { 0: 300, 1: 200, 2: 100 } } or { "list": [0, 2, 3] }. When "target", specify the class targeted by the resampling; the number of samples in the different classes will be equalized; possible choices are: majority (resample only the majority class), not minority (resample all classes but the minority class), not majority (resample all classes but the majority class), all (resample all classes), auto (equivalent to 'not minority'). When "ratio", it corresponds to the desired ratio of the number of samples in the minority class over the number of samples in the majority class after resampling (ONLY IN CASE OF BINARY CLASSIFICATION). When "dict", the keys correspond to the targeted classes, the values correspond to the desired number of samples for each targeted class. When "list", the list contains the classes targeted by the resampling.
            * **version** (*int*) - (1) Only for NearMiss method. Version of the NearMiss to use. Values: 1 (selects samples of the majority class that their average distances to three closest instances of the minority class are the smallest), 2 (uses three farthest samples of the minority class), 3 (selects a given number of the closest samples of the majority class for each sample of the minority class).
            * **n_neighbors** (*int*) - (1) [1~100|1] Only for NearMiss, CondensedNearestNeighbour, EditedNearestNeighbours and NeighbourhoodCleaningRule methods. Size of the neighbourhood to consider to compute the average distance to the minority point samples.
            * **threshold_cleaning** (*float*) - (0.5) [0~1|0.1] Only for NeighbourhoodCleaningRule method. Threshold used to whether consider a class or not during the cleaning after applying ENN.
            * **random_state_method** (*int*) - (5) [1~1000|1] Only for RandomUnderSampler and ClusterCentroids methods. Controls the randomization of the algorithm.
            * **random_state_evaluate** (*int*) - (5) [1~1000|1] Controls the shuffling applied to the Repeated Stratified K-Fold evaluation method.
            * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
            * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.

    Examples:
        This is a use example of how to use the building block from Python::

            from biobb_ml.resampling.undersampling import undersampling
            prop = {
                'method': 'enn',
                'type': 'regression',
                'target': {
                    'column': 'target'
                },
                'evaluate': True,
                'n_bins': 10,
                'n_neighbors': 3,
                'sampling_strategy': {
                    'target': 'auto'
                }
            }
            undersampling(input_dataset_path='/path/to/myDataset.csv',
                            output_dataset_path='/path/to/newDataset.csv',
                            properties=prop)

    Info:
        * wrapped_software:
            * name: imbalanced-learn under_sampling
            * version: >0.7.0
            * license: MIT
        * ontology:
            * name: EDAM
            * schema: http://edamontology.org/EDAM.owl

    """

    def __init__(self, input_dataset_path, output_dataset_path,
                 properties=None, **kwargs) -> None:
        properties = properties or {}

        # Call parent class constructor
        super().__init__(properties)
        self.locals_var_dict = locals().copy()

        # Input/Output files
        self.io_dict = {
            "in": {"input_dataset_path": input_dataset_path},
            "out": {"output_dataset_path": output_dataset_path}
        }

        # Properties specific for BB
        self.method = properties.get('method', None)
        self.type = properties.get('type', None)
        self.target = properties.get('target', {})
        self.evaluate = properties.get('evaluate', False)
        self.evaluate_splits = properties.get('evaluate_splits', 3)
        self.evaluate_repeats = properties.get('evaluate_repeats', 3)
        self.n_bins = properties.get('n_bins', 5)
        self.balanced_binning = properties.get('balanced_binning', False)
        self.sampling_strategy = properties.get('sampling_strategy', {'target': 'auto'})
        self.version = properties.get('version', 1)
        self.n_neighbors = properties.get('n_neighbors', 1)
        # Default fixed to 0.5 to match the documented default above and
        # imblearn's NeighbourhoodCleaningRule default (it was previously 1).
        self.threshold_cleaning = properties.get('threshold_cleaning', 0.5)
        self.random_state_method = properties.get('random_state_method', 5)
        self.random_state_evaluate = properties.get('random_state_evaluate', 5)
        self.properties = properties

        # Check the properties
        self.check_properties(properties)
        self.check_arguments()

    def check_data_params(self, out_log, err_log):
        """ Checks all the input/output paths and parameters """
        self.io_dict["in"]["input_dataset_path"] = check_input_path(self.io_dict["in"]["input_dataset_path"], "input_dataset_path", out_log, self.__class__.__name__)
        self.io_dict["out"]["output_dataset_path"] = check_output_path(self.io_dict["out"]["output_dataset_path"], "output_dataset_path", False, out_log, self.__class__.__name__)

    @launchlogger
    def launch(self) -> int:
        """Execute the :class:`Undersampling <resampling.undersampling.Undersampling>` resampling.undersampling.Undersampling object."""

        # check input/output paths and parameters
        self.check_data_params(self.out_log, self.err_log)

        # Setup Biobb
        if self.check_restart():
            return 0
        self.stage_files()

        # check mandatory properties
        method = getResamplingMethod(self.method, 'undersampling', self.out_log, self.__class__.__name__)
        checkResamplingType(self.type, self.out_log, self.__class__.__name__)
        sampling_strategy = getSamplingStrategy(self.sampling_strategy, self.out_log, self.__class__.__name__)

        # load dataset
        fu.log('Getting dataset from %s' % self.io_dict["in"]["input_dataset_path"], self.out_log, self.global_log)
        if 'column' in self.target:
            # a column name was given, so the CSV has a header row
            labels = getHeader(self.io_dict["in"]["input_dataset_path"])
            skiprows = 1
            header = 0
        else:
            labels = None
            skiprows = None
            header = None
        data = pd.read_csv(self.io_dict["in"]["input_dataset_path"], header=None, sep="\\s+|;|:|,|\t", engine="python", skiprows=skiprows, names=labels)

        train_df = data
        ranges = None

        le = preprocessing.LabelEncoder()

        # encode categorical (object-dtype) columns so imblearn can process them;
        # they are decoded back after resampling
        cols_encoded = []
        for column in train_df:
            # if type object, LabelEncoder.fit_transform
            if train_df[column].dtypes == 'object':
                cols_encoded.append(column)
                train_df[column] = le.fit_transform(train_df[column])

        # defining X
        X = train_df.loc[:, train_df.columns != getTargetValue(self.target, self.out_log, self.__class__.__name__)]
        # calling undersample method: instantiate the imblearn class with the
        # parameters relevant for the selected method
        if self.method == 'random':
            method = method(sampling_strategy=sampling_strategy, random_state=self.random_state_method)
        elif self.method == 'nearmiss':
            if self.version == 3:
                method = method(sampling_strategy=sampling_strategy, version=self.version, n_neighbors_ver3=self.n_neighbors)
            else:
                method = method(sampling_strategy=sampling_strategy, version=self.version, n_neighbors=self.n_neighbors)
        elif self.method == 'cnn':
            method = method(sampling_strategy=sampling_strategy, n_neighbors=self.n_neighbors)
        elif self.method == 'tomeklinks':
            method = method(sampling_strategy=sampling_strategy)
        elif self.method == 'enn':
            method = method(sampling_strategy=sampling_strategy, n_neighbors=self.n_neighbors)
        elif self.method == 'ncr':
            method = method(sampling_strategy=sampling_strategy, n_neighbors=self.n_neighbors, threshold_cleaning=self.threshold_cleaning)
        elif self.method == 'cluster':
            method = method(sampling_strategy=sampling_strategy, random_state=self.random_state_method)

        fu.log('Target: %s' % (getTargetValue(self.target, self.out_log, self.__class__.__name__)), self.out_log, self.global_log)

        # undersampling
        if self.type == 'regression':
            fu.log('Undersampling regression dataset, continuous data will be classified', self.out_log, self.global_log)
            # call resampler class for Regression ReSampling
            rs = resampler()
            # Create n_bins classes for the dataset
            ranges, y, target_pos = rs.fit(train_df, target=getTargetValue(self.target, self.out_log, self.__class__.__name__), bins=self.n_bins, balanced_binning=self.balanced_binning, verbose=0)
            # Get the under-sampled data
            final_X, final_y = rs.resample(method, train_df, y)
        elif self.type == 'classification':
            # get X and y
            y = getTarget(self.target, train_df, self.out_log, self.__class__.__name__)
            # fit and resample
            final_X, final_y = method.fit_resample(X, y)
            target_pos = None

        # evaluate undersampling
        if self.evaluate:
            fu.log('Evaluating data before undersampling with RandomForestClassifier', self.out_log, self.global_log)
            cv = RepeatedStratifiedKFold(n_splits=self.evaluate_splits, n_repeats=self.evaluate_repeats, random_state=self.random_state_evaluate)
            # evaluate model
            scores = cross_val_score(RandomForestClassifier(class_weight='balanced'), X, y, scoring='accuracy', cv=cv, n_jobs=-1)
            if not np.isnan(np.mean(scores)):
                fu.log('Mean Accuracy before undersampling: %.3f' % (np.mean(scores)), self.out_log, self.global_log)
            else:
                fu.log('Unable to calculate cross validation score, NaN was returned.', self.out_log, self.global_log)

        # log distribution before undersampling
        dist = ''
        for k, v in Counter(y).items():
            per = v / len(y) * 100
            rng = ''
            if ranges:
                rng = str(ranges[k])
            dist = dist + 'Class=%d, n=%d (%.3f%%) %s\n' % (k, v, per, rng)
        fu.log('Classes distribution before undersampling:\n\n%s' % dist, self.out_log, self.global_log)

        # join final_X and final_y in the output dataframe
        if header is None:
            # numpy
            out_df = np.column_stack((final_X, final_y))
        else:
            # pandas
            out_df = final_X.join(final_y)

        # if no header, convert np to pd
        if header is None:
            out_df = pd.DataFrame(data=out_df)

        # if cols encoded, decode them
        if cols_encoded:
            for column in cols_encoded:
                if header is None:
                    out_df = out_df.astype({column: int})
                out_df[column] = le.inverse_transform(out_df[column].values.ravel())

        # if no header, target is in a different column
        # NOTE(review): this falsy check skips target_pos == 0 and falls back to
        # getTargetValue — presumably equivalent for the first column; verify
        # against resampler.fit's return semantics.
        if target_pos:
            t = target_pos
        else:
            t = getTargetValue(self.target, self.out_log, self.__class__.__name__)
        # log distribution after undersampling
        if self.type == 'regression':
            ranges, y_out, _ = rs.fit(out_df, target=t, bins=self.n_bins, balanced_binning=self.balanced_binning, verbose=0)
        elif self.type == 'classification':
            y_out = getTarget(self.target, out_df, self.out_log, self.__class__.__name__)

        dist = ''
        for k, v in Counter(y_out).items():
            per = v / len(y_out) * 100
            rng = ''
            if ranges:
                rng = str(ranges[k])
            dist = dist + 'Class=%d, n=%d (%.3f%%) %s\n' % (k, v, per, rng)
        fu.log('Classes distribution after undersampling:\n\n%s' % dist, self.out_log, self.global_log)

        # evaluate undersampling
        if self.evaluate:
            fu.log('Evaluating data after undersampling with RandomForestClassifier', self.out_log, self.global_log)
            # Use the same user-configurable CV settings as the pre-resampling
            # evaluation (previously hard-coded to n_splits=3, n_repeats=3,
            # random_state=42) so both accuracy figures are comparable.
            cv = RepeatedStratifiedKFold(n_splits=self.evaluate_splits, n_repeats=self.evaluate_repeats, random_state=self.random_state_evaluate)
            # evaluate model
            scores = cross_val_score(RandomForestClassifier(class_weight='balanced'), final_X, y_out, scoring='accuracy', cv=cv, n_jobs=-1)
            if not np.isnan(np.mean(scores)):
                fu.log('Mean Accuracy after undersampling a %s dataset with %s method: %.3f' % (self.type, undersampling_methods[self.method]['method'], np.mean(scores)), self.out_log, self.global_log)
            else:
                fu.log('Unable to calculate cross validation score, NaN was returned.', self.out_log, self.global_log)

        # save output
        hdr = False
        if header == 0:
            hdr = True
        fu.log('Saving undersampled dataset to %s' % self.io_dict["out"]["output_dataset_path"], self.out_log, self.global_log)
        out_df.to_csv(self.io_dict["out"]["output_dataset_path"], index=False, header=hdr)

        # Copy files to host
        self.copy_to_host()

        self.tmp_files.extend([
            self.stage_io_dict.get("unique_dir")
        ])
        self.remove_tmp_files()

        self.check_arguments(output_files_created=True, raise_exception=False)

        return 0
def undersampling(input_dataset_path: str, output_dataset_path: str, properties: dict = None, **kwargs) -> int:
    """Instantiate the :class:`Undersampling <resampling.undersampling.Undersampling>` class and
    execute the :meth:`launch() <resampling.undersampling.Undersampling.launch>` method."""
    block = Undersampling(input_dataset_path=input_dataset_path,
                          output_dataset_path=output_dataset_path,
                          properties=properties, **kwargs)
    return block.launch()
def main():
    """Command line execution of this building block. Please check the command line documentation."""
    parser = argparse.ArgumentParser(
        description="Wrapper of most of the imblearn.under_sampling methods.",
        formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999))
    parser.add_argument('--config', required=False, help='Configuration file')

    # Specific args of each building block
    required_args = parser.add_argument_group('required arguments')
    required_args.add_argument('--input_dataset_path', required=True, help='Path to the input dataset. Accepted formats: csv.')
    required_args.add_argument('--output_dataset_path', required=True, help='Path to the output dataset. Accepted formats: csv.')

    args = parser.parse_args()
    # fall back to an empty JSON config when none was supplied
    config = args.config if args.config else "{}"
    properties = settings.ConfReader(config=config).get_prop_dic()

    # Specific call of each building block
    undersampling(input_dataset_path=args.input_dataset_path,
                  output_dataset_path=args.output_dataset_path,
                  properties=properties)
# Script entry point: run the command line interface when executed directly.
if __name__ == '__main__':
    main()