Coverage for biobb_ml/classification/support_vector_machine.py: 83%
150 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-10-03 14:57 +0000
« prev ^ index » next coverage.py v7.6.1, created at 2024-10-03 14:57 +0000
1#!/usr/bin/env python3
3"""Module containing the SupportVectorMachine class and the command line interface."""
4import argparse
5import joblib
6import pandas as pd
7import numpy as np
8from biobb_common.generic.biobb_object import BiobbObject
9from sklearn.preprocessing import StandardScaler
10from sklearn.model_selection import train_test_split
11from sklearn.metrics import confusion_matrix, classification_report, log_loss
12from sklearn import svm
13from biobb_common.configuration import settings
14from biobb_common.tools import file_utils as fu
15from biobb_common.tools.file_utils import launchlogger
16from biobb_ml.classification.common import check_input_path, check_output_path, getHeader, getIndependentVars, getIndependentVarsList, getTarget, getTargetValue, getWeight, plotMultipleCM, plotBinaryClassifier
class SupportVectorMachine(BiobbObject):
    """
    | biobb_ml SupportVectorMachine
    | Wrapper of the scikit-learn SupportVectorMachine method.
    | Trains and tests a given dataset and saves the model and scaler. Visit the `SupportVectorMachine documentation page <https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html>`_ in the sklearn official website for further information.

    Args:
        input_dataset_path (str): Path to the input dataset. File type: input. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/data/classification/dataset_support_vector_machine.csv>`_. Accepted formats: csv (edam:format_3752).
        output_model_path (str): Path to the output model file. File type: output. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/classification/ref_output_model_support_vector_machine.pkl>`_. Accepted formats: pkl (edam:format_3653).
        output_test_table_path (str) (Optional): Path to the test table file. File type: output. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/classification/ref_output_test_support_vector_machine.csv>`_. Accepted formats: csv (edam:format_3752).
        output_plot_path (str) (Optional): Path to the statistics plot. If target is binary it shows confusion matrix, distributions of the predicted probabilities of both classes and ROC curve. If target is non-binary it shows confusion matrix. File type: output. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/classification/ref_output_plot_support_vector_machine.png>`_. Accepted formats: png (edam:format_3603).
        properties (dic - Python dictionary object containing the tool parameters, not input/output files):
            * **independent_vars** (*dict*) - ({}) Independent variables you want to train from your dataset. You can specify either a list of columns names from your input dataset, a list of columns indexes or a range of columns indexes. Formats: { "columns": ["column1", "column2"] } or { "indexes": [0, 2, 3, 10, 11, 17] } or { "range": [[0, 20], [50, 102]] }. In case of multiple formats, the first one will be picked.
            * **target** (*dict*) - ({}) Dependent variable you want to predict from your dataset. You can specify either a column name or a column index. Formats: { "column": "column3" } or { "index": 21 }. In case of multiple formats, the first one will be picked.
            * **weight** (*dict*) - ({}) Weight variable from your dataset. You can specify either a column name or a column index. Formats: { "column": "column3" } or { "index": 21 }. In case of multiple formats, the first one will be picked.
            * **kernel** (*string*) - ("rbf") Specifies the kernel type to be used in the algorithm. Values: linear (It's used when the data is Linearly separable; that is; it can be separated using a single Line), poly (Represents the similarity of vectors -training samples- in a feature space over polynomials of the original variables; allowing learning of non-linear models), rbf (It's a function whose value depends on the distance from the origin or from some point), sigmoid (In Neural Networks field the bipolar sigmoid function is often used as an activation function for artificial neurons), precomputed (Precomputed kernel).
            * **normalize_cm** (*bool*) - (False) Whether or not to normalize the confusion matrix.
            * **random_state_method** (*int*) - (5) [1~1000|1] Controls the randomness of the estimator.
            * **random_state_train_test** (*int*) - (5) [1~1000|1] Controls the shuffling applied to the data before applying the split.
            * **test_size** (*float*) - (0.2) [0~1|0.05] Represents the proportion of the dataset to include in the test split. It should be between 0.0 and 1.0.
            * **scale** (*bool*) - (False) Whether or not to scale the input dataset.
            * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
            * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
            * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.

    Examples:
        This is a use example of how to use the building block from Python::

            from biobb_ml.classification.support_vector_machine import support_vector_machine
            prop = {
                'independent_vars': {
                    'columns': [ 'column1', 'column2', 'column3' ]
                },
                'target': {
                    'column': 'target'
                },
                'kernel': 'rbf',
                'test_size': 0.2
            }
            support_vector_machine(input_dataset_path='/path/to/myDataset.csv',
                                   output_model_path='/path/to/newModel.pkl',
                                   output_test_table_path='/path/to/newTable.csv',
                                   output_plot_path='/path/to/newPlot.png',
                                   properties=prop)

    Info:
        * wrapped_software:
            * name: scikit-learn SupportVectorMachine
            * version: >=0.24.2
            * license: BSD 3-Clause
        * ontology:
            * name: EDAM
            * schema: http://edamontology.org/EDAM.owl

    """

    def __init__(self, input_dataset_path, output_model_path,
                 output_test_table_path=None, output_plot_path=None, properties=None, **kwargs) -> None:
        """Store the input/output paths and read the tool parameters from ``properties``."""
        properties = properties or {}

        # Call parent class constructor
        super().__init__(properties)
        # Snapshot of the constructor arguments; must be taken before any other local is created
        self.locals_var_dict = locals().copy()

        # Input/Output files
        self.io_dict = {
            "in": {"input_dataset_path": input_dataset_path},
            "out": {"output_model_path": output_model_path, "output_test_table_path": output_test_table_path, "output_plot_path": output_plot_path}
        }

        # Properties specific for BB
        self.independent_vars = properties.get('independent_vars', {})
        self.target = properties.get('target', {})
        self.weight = properties.get('weight', {})
        self.kernel = properties.get('kernel', 'rbf')
        self.normalize_cm = properties.get('normalize_cm', False)
        self.random_state_method = properties.get('random_state_method', 5)
        self.random_state_train_test = properties.get('random_state_train_test', 5)
        self.test_size = properties.get('test_size', 0.2)
        self.scale = properties.get('scale', False)
        self.properties = properties

        # Check the properties
        self.check_properties(properties)
        self.check_arguments()

    def check_data_params(self, out_log, err_log):
        """ Checks all the input/output paths and parameters """
        self.io_dict["in"]["input_dataset_path"] = check_input_path(self.io_dict["in"]["input_dataset_path"], "input_dataset_path", out_log, self.__class__.__name__)
        self.io_dict["out"]["output_model_path"] = check_output_path(self.io_dict["out"]["output_model_path"], "output_model_path", False, out_log, self.__class__.__name__)
        # The optional outputs are only validated when the caller asked for them
        if self.io_dict["out"]["output_test_table_path"]:
            self.io_dict["out"]["output_test_table_path"] = check_output_path(self.io_dict["out"]["output_test_table_path"], "output_test_table_path", True, out_log, self.__class__.__name__)
        if self.io_dict["out"]["output_plot_path"]:
            self.io_dict["out"]["output_plot_path"] = check_output_path(self.io_dict["out"]["output_plot_path"], "output_plot_path", True, out_log, self.__class__.__name__)

    @staticmethod
    def _row_normalize(matrix):
        """Return ``matrix`` as floats with every row scaled to sum to 1; all-zero rows stay zero instead of becoming NaN."""
        matrix = matrix.astype('float')
        row_totals = matrix.sum(axis=1)[:, np.newaxis]
        return np.divide(matrix, row_totals, out=np.zeros_like(matrix), where=row_totals != 0)

    def _confusion_matrix(self, y_true, y_pred, labels):
        """Return ``(matrix, title)``: the confusion matrix over ``labels`` (normalized if ``normalize_cm``) and its log heading."""
        matrix = confusion_matrix(y_true, y_pred, labels=labels)
        if self.normalize_cm:
            return self._row_normalize(matrix), 'NORMALIZED CONFUSION MATRIX'
        return matrix, 'CONFUSION MATRIX, WITHOUT NORMALIZATION'

    @launchlogger
    def launch(self) -> int:
        """Execute the :class:`SupportVectorMachine <classification.support_vector_machine.SupportVectorMachine>` classification.support_vector_machine.SupportVectorMachine object."""

        # check input/output paths and parameters
        self.check_data_params(self.out_log, self.err_log)

        # Setup Biobb
        if self.check_restart():
            return 0
        self.stage_files()

        # load dataset
        fu.log('Getting dataset from %s' % self.io_dict["in"]["input_dataset_path"], self.out_log, self.global_log)
        if 'columns' in self.independent_vars:
            # variables are addressed by name: take the names from the header row and skip it
            labels = getHeader(self.io_dict["in"]["input_dataset_path"])
            skiprows = 1
        else:
            labels = None
            skiprows = None
        data = pd.read_csv(self.io_dict["in"]["input_dataset_path"], header=None, sep="\\s+|;|:|,|\t", engine="python", skiprows=skiprows, names=labels)

        # declare inputs, targets and weights
        # the inputs are all the independent variables
        X = getIndependentVars(self.independent_vars, data, self.out_log, self.__class__.__name__)
        fu.log('Independent variables: [%s]' % (getIndependentVarsList(self.independent_vars)), self.out_log, self.global_log)
        # target
        y = getTarget(self.target, data, self.out_log, self.__class__.__name__)
        fu.log('Target: %s' % (getTargetValue(self.target)), self.out_log, self.global_log)
        # weights
        w = None
        if self.weight:
            w = getWeight(self.weight, data, self.out_log, self.__class__.__name__)
            fu.log('Weight column provided', self.out_log, self.global_log)

        # train / test split
        fu.log('Creating train and test sets', self.out_log, self.global_log)
        w_train = None
        if self.weight:
            # if user provide weights, split them alongside the data (test weights are not used)
            X_train, X_test, y_train, y_test, w_train, _ = train_test_split(X, y, w, test_size=self.test_size, random_state=self.random_state_train_test)
        else:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=self.random_state_train_test)

        # scale dataset: the scaler is fitted on the training split only
        scaler = None
        if self.scale:
            fu.log('Scaling dataset', self.out_log, self.global_log)
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)

        # classification
        fu.log('Training dataset applying support vector machine', self.out_log, self.global_log)
        model = svm.SVC(kernel=self.kernel, probability=True, random_state=self.random_state_method)
        model.fit(X_train, y_train, sample_weight=w_train)

        # Every target value of the whole dataset, sorted. Used as the shared label set of both
        # confusion matrices so they keep the same shape (and match the plot ticks) even when a
        # split happens to miss a class.
        target_values = y.unique().tolist()
        target_values.sort()

        y_hat_train = model.predict(X_train)
        # classification report
        cr_train = classification_report(y_train, y_hat_train)
        # log loss; labels= ties the probability columns to the classes the model was fitted on
        yhat_prob_train = model.predict_proba(X_train)
        l_loss_train = log_loss(y_train, yhat_prob_train, labels=model.classes_)
        fu.log('Calculating scores and report for training dataset\n\nCLASSIFICATION REPORT\n\n%s\nLog loss: %.3f\n' % (cr_train, l_loss_train), self.out_log, self.global_log)

        # compute confusion matrix
        np.set_printoptions(precision=2)
        cnf_matrix_train, cm_type = self._confusion_matrix(y_train, y_hat_train, target_values)
        fu.log('Calculating confusion matrix for training dataset\n\n%s\n\n%s\n' % (cm_type, cnf_matrix_train), self.out_log, self.global_log)

        # testing
        if self.scale:
            X_test = scaler.transform(X_test)
        y_hat_test = model.predict(X_test)
        # computed once and reused for the table, the log loss and the plot
        yhat_prob = model.predict_proba(X_test)
        test_table = pd.DataFrame()
        # the probability columns follow model.classes_, so that is what the header must show
        test_table['P' + np.array2string(model.classes_)] = tuple(map(tuple, np.around(yhat_prob, decimals=2)))
        y_test = y_test.reset_index(drop=True)
        test_table['target'] = y_test
        fu.log('Testing\n\nTEST DATA\n\n%s\n' % test_table, self.out_log, self.global_log)

        # classification report
        cr = classification_report(y_test, y_hat_test)
        # log loss; labels= keeps this working when the test split lacks one of the fitted classes
        l_loss = log_loss(y_test, yhat_prob, labels=model.classes_)
        fu.log('Calculating scores and report for testing dataset\n\nCLASSIFICATION REPORT\n\n%s\nLog loss: %.3f\n' % (cr, l_loss), self.out_log, self.global_log)

        # compute confusion matrix
        cnf_matrix, cm_type = self._confusion_matrix(y_test, y_hat_test, target_values)
        fu.log('Calculating confusion matrix for testing dataset\n\n%s\n\n%s\n' % (cm_type, cnf_matrix), self.out_log, self.global_log)

        if self.io_dict["out"]["output_test_table_path"]:
            fu.log('Saving testing data to %s' % self.io_dict["out"]["output_test_table_path"], self.out_log, self.global_log)
            test_table.to_csv(self.io_dict["out"]["output_test_table_path"], index=False, header=True)

        # plot
        if self.io_dict["out"]["output_plot_path"]:
            if len(target_values) > 2:
                plot = plotMultipleCM(cnf_matrix_train, cnf_matrix, self.normalize_cm, target_values)
                fu.log('Saving confusion matrix plot to %s' % self.io_dict["out"]["output_plot_path"], self.out_log, self.global_log)
            else:
                plot = plotBinaryClassifier(model, yhat_prob_train, yhat_prob, cnf_matrix_train, cnf_matrix, y_train, y_test, normalize=self.normalize_cm)
                fu.log('Saving binary classifier evaluator plot to %s' % self.io_dict["out"]["output_plot_path"], self.out_log, self.global_log)
            plot.savefig(self.io_dict["out"]["output_plot_path"], dpi=150)

        # save model, scaler and parameters
        variables = {
            'target': self.target,
            'independent_vars': self.independent_vars,
            'scale': self.scale,
            'target_values': target_values
        }
        fu.log('Saving model to %s' % self.io_dict["out"]["output_model_path"], self.out_log, self.global_log)
        # The objects are dumped back to back into one file: model, then the scaler (only when
        # scaling was applied), then the parameters. The order is kept unchanged on purpose.
        with open(self.io_dict["out"]["output_model_path"], "wb") as f:
            joblib.dump(model, f)
            if self.scale:
                joblib.dump(scaler, f)
            joblib.dump(variables, f)

        # Copy files to host
        self.copy_to_host()

        self.tmp_files.extend([
            self.stage_io_dict.get("unique_dir")
        ])
        self.remove_tmp_files()

        self.check_arguments(output_files_created=True, raise_exception=False)

        return 0
def support_vector_machine(input_dataset_path: str, output_model_path: str, output_test_table_path: str = None, output_plot_path: str = None, properties: dict = None, **kwargs) -> int:
    """Create a :class:`SupportVectorMachine <classification.support_vector_machine.SupportVectorMachine>` object from the given paths and properties,
    run its :meth:`launch() <classification.support_vector_machine.SupportVectorMachine.launch>` method and return the value it returns."""

    building_block = SupportVectorMachine(
        input_dataset_path=input_dataset_path,
        output_model_path=output_model_path,
        output_test_table_path=output_test_table_path,
        output_plot_path=output_plot_path,
        properties=properties,
        **kwargs,
    )
    return building_block.launch()
def main():
    """Command line entry point: parse the arguments, read the configuration and run the building block."""

    def wide_formatter(prog):
        # a very large width keeps every help text on a single line
        return argparse.RawTextHelpFormatter(prog, width=99999)

    parser = argparse.ArgumentParser(description="Wrapper of the scikit-learn SupportVectorMachine method.", formatter_class=wide_formatter)
    parser.add_argument('--config', required=False, help='Configuration file')

    # Specific args of each building block
    required_args = parser.add_argument_group('required arguments')
    mandatory = (
        ('--input_dataset_path', 'Path to the input dataset. Accepted formats: csv.'),
        ('--output_model_path', 'Path to the output model file. Accepted formats: pkl.'),
    )
    for flag, help_text in mandatory:
        required_args.add_argument(flag, required=True, help=help_text)

    optional = (
        ('--output_test_table_path', 'Path to the test table file. Accepted formats: csv.'),
        ('--output_plot_path', 'Path to the statistics plot. If target is binary it shows confusion matrix, distributions of the predicted probabilities of both classes and ROC curve. If target is non-binary it shows confusion matrix. Accepted formats: png.'),
    )
    for flag, help_text in optional:
        parser.add_argument(flag, required=False, help=help_text)

    args = parser.parse_args()
    # without --config an empty JSON object is handed to the configuration reader
    config = args.config or "{}"
    properties = settings.ConfReader(config=config).get_prop_dic()

    # Specific call of each building block
    support_vector_machine(
        input_dataset_path=args.input_dataset_path,
        output_model_path=args.output_model_path,
        output_test_table_path=args.output_test_table_path,
        output_plot_path=args.output_plot_path,
        properties=properties,
    )


if __name__ == '__main__':
    main()