Coverage for biobb_ml/classification/support_vector_machine.py: 83%
151 statements
« prev ^ index » next coverage.py v7.5.1, created at 2024-05-07 09:39 +0000
« prev ^ index » next coverage.py v7.5.1, created at 2024-05-07 09:39 +0000
1#!/usr/bin/env python3
3"""Module containing the SupportVectorMachine class and the command line interface."""
4import argparse
5import joblib
6import pandas as pd
7import numpy as np
8from biobb_common.generic.biobb_object import BiobbObject
9from sklearn.preprocessing import StandardScaler
10from sklearn.model_selection import train_test_split
11from sklearn.metrics import confusion_matrix, classification_report, log_loss
12from sklearn import svm
13from biobb_common.configuration import settings
14from biobb_common.tools import file_utils as fu
15from biobb_common.tools.file_utils import launchlogger
16from biobb_ml.classification.common import check_input_path, check_output_path, getHeader, getIndependentVars, getIndependentVarsList, getTarget, getTargetValue, getWeight, plotMultipleCM, plotBinaryClassifier
class SupportVectorMachine(BiobbObject):
    """
    | biobb_ml SupportVectorMachine
    | Wrapper of the scikit-learn SupportVectorMachine method.
    | Trains and tests a given dataset and saves the model and scaler. Visit the `SupportVectorMachine documentation page <https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html>`_ in the sklearn official website for further information.

    Args:
        input_dataset_path (str): Path to the input dataset. File type: input. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/data/classification/dataset_support_vector_machine.csv>`_. Accepted formats: csv (edam:format_3752).
        output_model_path (str): Path to the output model file. File type: output. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/classification/ref_output_model_support_vector_machine.pkl>`_. Accepted formats: pkl (edam:format_3653).
        output_test_table_path (str) (Optional): Path to the test table file. File type: output. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/classification/ref_output_test_support_vector_machine.csv>`_. Accepted formats: csv (edam:format_3752).
        output_plot_path (str) (Optional): Path to the statistics plot. If target is binary it shows confusion matrix, distributions of the predicted probabilities of both classes and ROC curve. If target is non-binary it shows confusion matrix. File type: output. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/classification/ref_output_plot_support_vector_machine.png>`_. Accepted formats: png (edam:format_3603).
        properties (dic - Python dictionary object containing the tool parameters, not input/output files):
            * **independent_vars** (*dict*) - ({}) Independent variables you want to train from your dataset. You can specify either a list of columns names from your input dataset, a list of columns indexes or a range of columns indexes. Formats: { "columns": ["column1", "column2"] } or { "indexes": [0, 2, 3, 10, 11, 17] } or { "range": [[0, 20], [50, 102]] }. In case of multiple formats, the first one will be picked.
            * **target** (*dict*) - ({}) Dependent variable you want to predict from your dataset. You can specify either a column name or a column index. Formats: { "column": "column3" } or { "index": 21 }. In case of multiple formats, the first one will be picked.
            * **weight** (*dict*) - ({}) Weight variable from your dataset. You can specify either a column name or a column index. Formats: { "column": "column3" } or { "index": 21 }. In case of multiple formats, the first one will be picked.
            * **kernel** (*string*) - ("rbf") Specifies the kernel type to be used in the algorithm. Values: linear (It's used when the data is Linearly separable; that is; it can be separated using a single Line), poly (Represents the similarity of vectors -training samples- in a feature space over polynomials of the original variables; allowing learning of non-linear models), rbf (It's a function whose value depends on the distance from the origin or from some point), sigmoid (In Neural Networks field the bipolar sigmoid function is often used as an activation function for artificial neurons), precomputed (Precomputed kernel).
            * **normalize_cm** (*bool*) - (False) Whether or not to normalize the confusion matrix.
            * **random_state_method** (*int*) - (5) [1~1000|1] Controls the randomness of the estimator.
            * **random_state_train_test** (*int*) - (5) [1~1000|1] Controls the shuffling applied to the data before applying the split.
            * **test_size** (*float*) - (0.2) [0~1|0.05] Represents the proportion of the dataset to include in the test split. It should be between 0.0 and 1.0.
            * **scale** (*bool*) - (False) Whether or not to scale the input dataset.
            * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
            * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.

    Examples:
        This is a use example of how to use the building block from Python::

            from biobb_ml.classification.support_vector_machine import support_vector_machine
            prop = {
                'independent_vars': {
                    'columns': [ 'column1', 'column2', 'column3' ]
                },
                'target': {
                    'column': 'target'
                },
                'kernel': 'rbf',
                'test_size': 0.2
            }
            support_vector_machine(input_dataset_path='/path/to/myDataset.csv',
                                   output_model_path='/path/to/newModel.pkl',
                                   output_test_table_path='/path/to/newTable.csv',
                                   output_plot_path='/path/to/newPlot.png',
                                   properties=prop)

    Info:
        * wrapped_software:
            * name: scikit-learn SupportVectorMachine
            * version: >=0.24.2
            * license: BSD 3-Clause
        * ontology:
            * name: EDAM
            * schema: http://edamontology.org/EDAM.owl

    """

    def __init__(self, input_dataset_path, output_model_path,
                 output_test_table_path=None, output_plot_path=None, properties=None, **kwargs) -> None:
        """Store the input/output paths and the tool properties, then validate them.

        Args:
            input_dataset_path: Path to the input dataset.
            output_model_path: Path where the trained model is written.
            output_test_table_path: Optional path for the test table.
            output_plot_path: Optional path for the statistics plot.
            properties: Dictionary with the tool parameters; ``None`` is treated as ``{}``.
            **kwargs: Accepted for call compatibility; not used by this constructor.
        """
        properties = properties or {}

        # Call parent class constructor
        super().__init__(properties)
        # Snapshot of the constructor arguments, taken before any other local is created
        self.locals_var_dict = locals().copy()

        # Input/Output files
        self.io_dict = {
            "in": {"input_dataset_path": input_dataset_path},
            "out": {"output_model_path": output_model_path, "output_test_table_path": output_test_table_path, "output_plot_path": output_plot_path}
        }

        # Properties specific for BB
        self.independent_vars = properties.get('independent_vars', {})
        self.target = properties.get('target', {})
        self.weight = properties.get('weight', {})
        self.kernel = properties.get('kernel', 'rbf')
        self.normalize_cm = properties.get('normalize_cm', False)
        self.random_state_method = properties.get('random_state_method', 5)
        self.random_state_train_test = properties.get('random_state_train_test', 5)
        self.test_size = properties.get('test_size', 0.2)
        self.scale = properties.get('scale', False)
        self.properties = properties

        # Check the properties
        self.check_properties(properties)
        self.check_arguments()

    def check_data_params(self, out_log, err_log):
        """ Checks all the input/output paths and parameters """
        self.io_dict["in"]["input_dataset_path"] = check_input_path(self.io_dict["in"]["input_dataset_path"], "input_dataset_path", out_log, self.__class__.__name__)
        self.io_dict["out"]["output_model_path"] = check_output_path(self.io_dict["out"]["output_model_path"], "output_model_path", False, out_log, self.__class__.__name__)
        # The two remaining outputs are optional: only validate them when provided
        if self.io_dict["out"]["output_test_table_path"]:
            self.io_dict["out"]["output_test_table_path"] = check_output_path(self.io_dict["out"]["output_test_table_path"], "output_test_table_path", True, out_log, self.__class__.__name__)
        if self.io_dict["out"]["output_plot_path"]:
            self.io_dict["out"]["output_plot_path"] = check_output_path(self.io_dict["out"]["output_plot_path"], "output_plot_path", True, out_log, self.__class__.__name__)

    @staticmethod
    def _confusion_matrix(y_true, y_pred, labels, normalize):
        """Build a confusion matrix over a fixed label set.

        Args:
            y_true: True target values.
            y_pred: Predicted target values.
            labels: Ordered list of labels indexing the rows/columns, so that
                matrices built for different splits share the same shape.
            normalize: Whether to divide every row by its total.

        Returns:
            Tuple ``(matrix, title)`` where ``title`` is the heading used in the log.
        """
        matrix = confusion_matrix(y_true, y_pred, labels=labels)
        np.set_printoptions(precision=2)
        if not normalize:
            return matrix, 'CONFUSION MATRIX, WITHOUT NORMALIZATION'
        row_totals = matrix.sum(axis=1)[:, np.newaxis]
        # A label absent from y_true yields an all-zero row: keep it at 0 instead of dividing by zero
        normalized = np.divide(matrix.astype('float'), row_totals, out=np.zeros(matrix.shape, dtype=float), where=row_totals != 0)
        return normalized, 'NORMALIZED CONFUSION MATRIX'

    @launchlogger
    def launch(self) -> int:
        """Execute the :class:`SupportVectorMachine <classification.support_vector_machine.SupportVectorMachine>` classification.support_vector_machine.SupportVectorMachine object.

        Loads the dataset, splits it into train and test sets, optionally scales
        it, fits an SVC, logs reports for both sets and writes the requested
        output files.

        Returns:
            int: 0, both when the run completes and when it is skipped by the restart check.
        """

        # check input/output paths and parameters
        self.check_data_params(self.out_log, self.err_log)

        # Setup Biobb
        if self.check_restart():
            return 0
        self.stage_files()

        # load dataset
        fu.log('Getting dataset from %s' % self.io_dict["in"]["input_dataset_path"], self.out_log, self.global_log)
        if 'columns' in self.independent_vars:
            # columns are addressed by name: take the names from the header and skip that row
            labels = getHeader(self.io_dict["in"]["input_dataset_path"])
            skiprows = 1
        else:
            labels = None
            skiprows = None
        data = pd.read_csv(self.io_dict["in"]["input_dataset_path"], header=None, sep="\\s+|;|:|,|\t", engine="python", skiprows=skiprows, names=labels)

        # declare inputs, targets and weights
        # the inputs are all the independent variables
        X = getIndependentVars(self.independent_vars, data, self.out_log, self.__class__.__name__)
        fu.log('Independent variables: [%s]' % (getIndependentVarsList(self.independent_vars)), self.out_log, self.global_log)
        # target
        y = getTarget(self.target, data, self.out_log, self.__class__.__name__)
        fu.log('Target: %s' % (getTargetValue(self.target)), self.out_log, self.global_log)
        # sorted distinct target values of the whole dataset; used for the matrices, the plot and the saved variables
        target_values = sorted(y.unique().tolist())
        # weights
        if self.weight:
            w = getWeight(self.weight, data, self.out_log, self.__class__.__name__)
            fu.log('Weight column provided', self.out_log, self.global_log)

        # train / test split
        fu.log('Creating train and test sets', self.out_log, self.global_log)
        if self.weight:
            # if user provide weights they are split alongside X and y
            X_train, X_test, y_train, y_test, w_train, _ = train_test_split(X, y, w, test_size=self.test_size, random_state=self.random_state_train_test)
        else:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=self.random_state_train_test)
            w_train = None

        # scale dataset
        if self.scale:
            fu.log('Scaling dataset', self.out_log, self.global_log)
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)

        # classification
        fu.log('Training dataset applying support vector machine', self.out_log, self.global_log)
        model = svm.SVC(kernel=self.kernel, probability=True, random_state=self.random_state_method)
        # sample_weight=None is the estimator default, so the unweighted case is unchanged
        model.fit(X_train, y_train, sample_weight=w_train)

        y_hat_train = model.predict(X_train)
        # classification report
        cr_train = classification_report(y_train, y_hat_train)
        # log loss; the probability columns follow model.classes_, so state the labels explicitly
        yhat_prob_train = model.predict_proba(X_train)
        l_loss_train = log_loss(y_train, yhat_prob_train, labels=model.classes_)
        fu.log('Calculating scores and report for training dataset\n\nCLASSIFICATION REPORT\n\n%s\nLog loss: %.3f\n' % (cr_train, l_loss_train), self.out_log, self.global_log)

        # compute confusion matrix
        cnf_matrix_train, cm_type = self._confusion_matrix(y_train, y_hat_train, target_values, self.normalize_cm)

        fu.log('Calculating confusion matrix for training dataset\n\n%s\n\n%s\n' % (cm_type, cnf_matrix_train), self.out_log, self.global_log)

        if self.scale:
            # reuse the scaler fitted on the training set
            X_test = scaler.transform(X_test)
        y_hat_test = model.predict(X_test)
        # probabilities are computed once and reused for the table, the log loss and the plot
        yhat_prob = model.predict_proba(X_test)
        test_table = pd.DataFrame()
        y_hat_prob = np.around(yhat_prob, decimals=2)
        y_hat_prob = tuple(map(tuple, y_hat_prob))
        # label the column with model.classes_: that is the order of the probabilities,
        # even when the test split does not contain every class
        test_table['P' + np.array2string(model.classes_)] = y_hat_prob
        y_test = y_test.reset_index(drop=True)
        test_table['target'] = y_test
        fu.log('Testing\n\nTEST DATA\n\n%s\n' % test_table, self.out_log, self.global_log)

        # classification report
        cr = classification_report(y_test, y_hat_test)
        # log loss; explicit labels avoid a failure when y_test lacks one of the trained classes
        l_loss = log_loss(y_test, yhat_prob, labels=model.classes_)
        fu.log('Calculating scores and report for testing dataset\n\nCLASSIFICATION REPORT\n\n%s\nLog loss: %.3f\n' % (cr, l_loss), self.out_log, self.global_log)

        # compute confusion matrix
        cnf_matrix, cm_type = self._confusion_matrix(y_test, y_hat_test, target_values, self.normalize_cm)

        fu.log('Calculating confusion matrix for testing dataset\n\n%s\n\n%s\n' % (cm_type, cnf_matrix), self.out_log, self.global_log)

        if (self.io_dict["out"]["output_test_table_path"]):
            fu.log('Saving testing data to %s' % self.io_dict["out"]["output_test_table_path"], self.out_log, self.global_log)
            test_table.to_csv(self.io_dict["out"]["output_test_table_path"], index=False, header=True)

        # plot
        if self.io_dict["out"]["output_plot_path"]:
            if len(target_values) > 2:
                plot = plotMultipleCM(cnf_matrix_train, cnf_matrix, self.normalize_cm, target_values)
                fu.log('Saving confusion matrix plot to %s' % self.io_dict["out"]["output_plot_path"], self.out_log, self.global_log)
            else:
                plot = plotBinaryClassifier(model, yhat_prob_train, yhat_prob, cnf_matrix_train, cnf_matrix, y_train, y_test, normalize=self.normalize_cm)
                fu.log('Saving binary classifier evaluator plot to %s' % self.io_dict["out"]["output_plot_path"], self.out_log, self.global_log)
            plot.savefig(self.io_dict["out"]["output_plot_path"], dpi=150)

        # save model, scaler and parameters
        variables = {
            'target': self.target,
            'independent_vars': self.independent_vars,
            'scale': self.scale,
            'target_values': target_values
        }
        fu.log('Saving model to %s' % self.io_dict["out"]["output_model_path"], self.out_log, self.global_log)
        # the objects are dumped one after another into the same file:
        # model, then the scaler (only when scaling is enabled), then the variables
        with open(self.io_dict["out"]["output_model_path"], "wb") as f:
            joblib.dump(model, f)
            if self.scale:
                joblib.dump(scaler, f)
            joblib.dump(variables, f)

        # Copy files to host
        self.copy_to_host()

        self.tmp_files.extend([
            self.stage_io_dict.get("unique_dir")
        ])
        self.remove_tmp_files()

        self.check_arguments(output_files_created=True, raise_exception=False)

        return 0
def support_vector_machine(input_dataset_path: str, output_model_path: str, output_test_table_path: str = None, output_plot_path: str = None, properties: dict = None, **kwargs) -> int:
    """Build a :class:`SupportVectorMachine <classification.support_vector_machine.SupportVectorMachine>` from the given
    arguments and return the result of its :meth:`launch() <classification.support_vector_machine.SupportVectorMachine.launch>` method."""

    building_block = SupportVectorMachine(
        input_dataset_path=input_dataset_path,
        output_model_path=output_model_path,
        output_test_table_path=output_test_table_path,
        output_plot_path=output_plot_path,
        properties=properties,
        **kwargs,
    )
    return building_block.launch()
def main():
    """Command line entry point: parse the arguments, read the configuration and run the building block."""

    def _wide_help_formatter(prog):
        # Very large width so argparse does not wrap the long help strings
        return argparse.RawTextHelpFormatter(prog, width=99999)

    parser = argparse.ArgumentParser(
        description="Wrapper of the scikit-learn SupportVectorMachine method.",
        formatter_class=_wide_help_formatter,
    )
    parser.add_argument('--config', required=False, help='Configuration file')

    # Specific args of each building block
    required_group = parser.add_argument_group('required arguments')
    required_group.add_argument('--input_dataset_path', required=True, help='Path to the input dataset. Accepted formats: csv.')
    required_group.add_argument('--output_model_path', required=True, help='Path to the output model file. Accepted formats: pkl.')
    parser.add_argument('--output_test_table_path', required=False, help='Path to the test table file. Accepted formats: csv.')
    parser.add_argument('--output_plot_path', required=False, help='Path to the statistics plot. If target is binary it shows confusion matrix, distributions of the predicted probabilities of both classes and ROC curve. If target is non-binary it shows confusion matrix. Accepted formats: png.')

    cli_args = parser.parse_args()
    # Without --config, fall back to an empty JSON object
    config_source = cli_args.config or "{}"
    properties = settings.ConfReader(config=config_source).get_prop_dic()

    # Specific call of each building block
    support_vector_machine(
        input_dataset_path=cli_args.input_dataset_path,
        output_model_path=cli_args.output_model_path,
        output_test_table_path=cli_args.output_test_table_path,
        output_plot_path=cli_args.output_plot_path,
        properties=properties,
    )
# Run the command line interface only when the module is executed as a script
if __name__ == "__main__":
    main()