#!/usr/bin/env python3

"""Module containing the SupportVectorMachine class and the command line interface."""
import argparse
import joblib
import pandas as pd
import numpy as np
from biobb_common.generic.biobb_object import BiobbObject
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, log_loss
from sklearn import svm
from biobb_common.configuration import settings
from biobb_common.tools import file_utils as fu
from biobb_common.tools.file_utils import launchlogger
from biobb_ml.classification.common import check_input_path, check_output_path, getHeader, getIndependentVars, getIndependentVarsList, getTarget, getTargetValue, getWeight, plotMultipleCM, plotBinaryClassifier


class SupportVectorMachine(BiobbObject):
20 """ 

21 | biobb_ml SupportVectorMachine 

22 | Wrapper of the scikit-learn SupportVectorMachine method. 

23 | Trains and tests a given dataset and saves the model and scaler. Visit the `SupportVectorMachine documentation page <https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html>`_ in the sklearn official website for further information. 

24 

25 Args: 

26 input_dataset_path (str): Path to the input dataset. File type: input. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/data/classification/dataset_support_vector_machine.csv>`_. Accepted formats: csv (edam:format_3752). 

27 output_model_path (str): Path to the output model file. File type: output. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/classification/ref_output_model_support_vector_machine.pkl>`_. Accepted formats: pkl (edam:format_3653). 

28 output_test_table_path (str) (Optional): Path to the test table file. File type: output. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/classification/ref_output_test_support_vector_machine.csv>`_. Accepted formats: csv (edam:format_3752). 

29 output_plot_path (str) (Optional): Path to the statistics plot. If target is binary it shows confusion matrix, distributions of the predicted probabilities of both classes and ROC curve. If target is non-binary it shows confusion matrix. File type: output. `Sample file <https://github.com/bioexcel/biobb_ml/raw/master/biobb_ml/test/reference/classification/ref_output_plot_support_vector_machine.png>`_. Accepted formats: png (edam:format_3603). 

30 properties (dic - Python dictionary object containing the tool parameters, not input/output files): 

31 * **independent_vars** (*dict*) - ({}) Independent variables you want to train from your dataset. You can specify either a list of columns names from your input dataset, a list of columns indexes or a range of columns indexes. Formats: { "columns": ["column1", "column2"] } or { "indexes": [0, 2, 3, 10, 11, 17] } or { "range": [[0, 20], [50, 102]] }. In case of mulitple formats, the first one will be picked. 

32 * **target** (*dict*) - ({}) Dependent variable you want to predict from your dataset. You can specify either a column name or a column index. Formats: { "column": "column3" } or { "index": 21 }. In case of mulitple formats, the first one will be picked. 

33 * **weight** (*dict*) - ({}) Weight variable from your dataset. You can specify either a column name or a column index. Formats: { "column": "column3" } or { "index": 21 }. In case of mulitple formats, the first one will be picked. 

34 * **kernel** (*string*) - ("rbf") Specifies the kernel type to be used in the algorithm. Values: linear (It's used when the data is Linearly separable; that is; it can be separated using a single Line), poly (Represents the similarity of vectors -training samples- in a feature space over polynomials of the original variables; allowing learning of non-linear models), rbf (It's a function whose value depends on the distance from the origin or from some point), sigmoid (In Neural Networks field the bipolar sigmoid function is often used as an activation function for artificial neurons), precomputed (Precomputed kernel). 

35 * **normalize_cm** (*bool*) - (False) Whether or not to normalize the confusion matrix. 

36 * **random_state_method** (*int*) - (5) [1~1000|1] Controls the randomness of the estimator. 

37 * **random_state_train_test** (*int*) - (5) [1~1000|1] Controls the shuffling applied to the data before applying the split. 

38 * **test_size** (*float*) - (0.2) [0~1|0.05] Represents the proportion of the dataset to include in the test split. It should be between 0.0 and 1.0. 

39 * **scale** (*bool*) - (False) Whether or not to scale the input dataset. 

40 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files. 

41 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist. 

42 

43 Examples: 

44 This is a use example of how to use the building block from Python:: 

45 

46 from biobb_ml.classification.support_vector_machine import support_vector_machine 

47 prop = { 

48 'independent_vars': { 

49 'columns': [ 'column1', 'column2', 'column3' ] 

50 }, 

51 'target': { 

52 'column': 'target' 

53 }, 

54 'kernel': 'rbf', 

55 'test_size': 0.2 

56 } 

57 support_vector_machine(input_dataset_path='/path/to/myDataset.csv', 

58 output_model_path='/path/to/newModel.pkl', 

59 output_test_table_path='/path/to/newTable.csv', 

60 output_plot_path='/path/to/newPlot.png', 

61 properties=prop) 

62 

63 Info: 

64 * wrapped_software: 

65 * name: scikit-learn SupportVectorMachine 

66 * version: >=0.24.2 

67 * license: BSD 3-Clause 

68 * ontology: 

69 * name: EDAM 

70 * schema: http://edamontology.org/EDAM.owl 

71 

72 """ 

73 

74 def __init__(self, input_dataset_path, output_model_path, 

75 output_test_table_path=None, output_plot_path=None, properties=None, **kwargs) -> None: 

        properties = properties or {}

        # Call parent class constructor
        super().__init__(properties)
        self.locals_var_dict = locals().copy()

        # Input/Output files
        self.io_dict = {
            "in": {"input_dataset_path": input_dataset_path},
            "out": {"output_model_path": output_model_path, "output_test_table_path": output_test_table_path, "output_plot_path": output_plot_path}
        }

        # Properties specific for BB
        self.independent_vars = properties.get('independent_vars', {})
        self.target = properties.get('target', {})
        self.weight = properties.get('weight', {})
        self.kernel = properties.get('kernel', 'rbf')
        self.normalize_cm = properties.get('normalize_cm', False)
        self.random_state_method = properties.get('random_state_method', 5)
        self.random_state_train_test = properties.get('random_state_train_test', 5)
        self.test_size = properties.get('test_size', 0.2)
        self.scale = properties.get('scale', False)
        self.properties = properties

        # Check the properties
        self.check_properties(properties)
        self.check_arguments()

    def check_data_params(self, out_log, err_log):
        """ Checks all the input/output paths and parameters """
        self.io_dict["in"]["input_dataset_path"] = check_input_path(self.io_dict["in"]["input_dataset_path"], "input_dataset_path", out_log, self.__class__.__name__)
        self.io_dict["out"]["output_model_path"] = check_output_path(self.io_dict["out"]["output_model_path"], "output_model_path", False, out_log, self.__class__.__name__)
        if self.io_dict["out"]["output_test_table_path"]:
            self.io_dict["out"]["output_test_table_path"] = check_output_path(self.io_dict["out"]["output_test_table_path"], "output_test_table_path", True, out_log, self.__class__.__name__)
        if self.io_dict["out"]["output_plot_path"]:
            self.io_dict["out"]["output_plot_path"] = check_output_path(self.io_dict["out"]["output_plot_path"], "output_plot_path", True, out_log, self.__class__.__name__)

    @launchlogger
    def launch(self) -> int:
        """Execute the :class:`SupportVectorMachine <classification.support_vector_machine.SupportVectorMachine>` classification.support_vector_machine.SupportVectorMachine object."""

        # check input/output paths and parameters
        self.check_data_params(self.out_log, self.err_log)

        # Setup Biobb
        if self.check_restart():
            return 0
        self.stage_files()

        # load dataset
        fu.log('Getting dataset from %s' % self.io_dict["in"]["input_dataset_path"], self.out_log, self.global_log)
        if 'columns' in self.independent_vars:
            labels = getHeader(self.io_dict["in"]["input_dataset_path"])
            skiprows = 1
        else:
            labels = None
            skiprows = None
        data = pd.read_csv(self.io_dict["in"]["input_dataset_path"], header=None, sep="\\s+|;|:|,|\t", engine="python", skiprows=skiprows, names=labels)
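        # The regex separator lets pandas accept whitespace-, semicolon-, colon-, comma-
        # and tab-delimited files (engine="python" is required for regex separators); when
        # column names are requested, the header row is read by getHeader() and skipped here.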

        # declare inputs, targets and weights
        # the inputs are all the independent variables
        X = getIndependentVars(self.independent_vars, data, self.out_log, self.__class__.__name__)
        fu.log('Independent variables: [%s]' % (getIndependentVarsList(self.independent_vars)), self.out_log, self.global_log)
        # target
        y = getTarget(self.target, data, self.out_log, self.__class__.__name__)
        fu.log('Target: %s' % (getTargetValue(self.target)), self.out_log, self.global_log)
        # weights
        if self.weight:
            w = getWeight(self.weight, data, self.out_log, self.__class__.__name__)
            fu.log('Weight column provided', self.out_log, self.global_log)
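        # When provided, the sample weights are carried through the split below and passed
        # to SVC.fit() as sklearn's sample_weight, rescaling each sample's contribution to
        # the SVM objective.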

        # train / test split
        fu.log('Creating train and test sets', self.out_log, self.global_log)
        arrays_sets = (X, y)
        # if the user provides weights
        if self.weight:
            arrays_sets = arrays_sets + (w,)
            X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(*arrays_sets, test_size=self.test_size, random_state=self.random_state_train_test)
        else:
            X_train, X_test, y_train, y_test = train_test_split(*arrays_sets, test_size=self.test_size, random_state=self.random_state_train_test)
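        # train_test_split accepts any number of equal-length arrays and splits them all
        # with the same shuffled indices, so X, y (and w, when present) stay row-aligned.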

        # scale dataset
        if self.scale:
            fu.log('Scaling dataset', self.out_log, self.global_log)
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
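            # The scaler is fitted on the training split only; the test split is later
            # transformed with the same statistics (see below) to avoid leaking test
            # information into training.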

        # classification
        fu.log('Training dataset applying support vector machine', self.out_log, self.global_log)
        model = svm.SVC(kernel=self.kernel, probability=True, random_state=self.random_state_method)
        arrays_fit = (X_train, y_train)
        # if the user provides weights
        if self.weight:
            arrays_fit = arrays_fit + (w_train,)

        model.fit(*arrays_fit)
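        # probability=True makes SVC calibrate class probabilities with Platt scaling
        # (fitted via internal cross-validation), enabling the predict_proba() calls used
        # below for the log loss, the test table and the binary-classifier plot.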

        y_hat_train = model.predict(X_train)
        # classification report
        cr_train = classification_report(y_train, y_hat_train)
        # log loss
        yhat_prob_train = model.predict_proba(X_train)
        l_loss_train = log_loss(y_train, yhat_prob_train)
        fu.log('Calculating scores and report for training dataset\n\nCLASSIFICATION REPORT\n\n%s\nLog loss: %.3f\n' % (cr_train, l_loss_train), self.out_log, self.global_log)
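        # Log loss is the negative mean log-likelihood of the true labels under the
        # predicted probabilities: lower is better, and confident wrong predictions are
        # penalized heavily.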

        # compute confusion matrix
        cnf_matrix_train = confusion_matrix(y_train, y_hat_train)
        np.set_printoptions(precision=2)
        if self.normalize_cm:
            cnf_matrix_train = cnf_matrix_train.astype('float') / cnf_matrix_train.sum(axis=1)[:, np.newaxis]
            cm_type = 'NORMALIZED CONFUSION MATRIX'
        else:
            cm_type = 'CONFUSION MATRIX, WITHOUT NORMALIZATION'

        fu.log('Calculating confusion matrix for training dataset\n\n%s\n\n%s\n' % (cm_type, cnf_matrix_train), self.out_log, self.global_log)
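        # Row-wise normalization divides each row by the number of true samples of that
        # class, so the diagonal entries become per-class recall values.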

        if self.scale:
            X_test = scaler.transform(X_test)
        y_hat_test = model.predict(X_test)
        test_table = pd.DataFrame()
        y_hat_prob = model.predict_proba(X_test)
        y_hat_prob = np.around(y_hat_prob, decimals=2)
        y_hat_prob = tuple(map(tuple, y_hat_prob))
        test_table['P' + np.array2string(np.unique(y_test))] = y_hat_prob
        y_test = y_test.reset_index(drop=True)
        test_table['target'] = y_test
        fu.log('Testing\n\nTEST DATA\n\n%s\n' % test_table, self.out_log, self.global_log)
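        # The probability column holds one tuple of rounded per-class probabilities per
        # test row and is labelled with the sorted class values (e.g. 'P[0 1]' for a
        # binary 0/1 target); 'target' keeps the true labels, re-indexed to match.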

        # classification report
        cr = classification_report(y_test, y_hat_test)
        # log loss
        yhat_prob = model.predict_proba(X_test)
        l_loss = log_loss(y_test, yhat_prob)
        fu.log('Calculating scores and report for testing dataset\n\nCLASSIFICATION REPORT\n\n%s\nLog loss: %.3f\n' % (cr, l_loss), self.out_log, self.global_log)

        # compute confusion matrix
        cnf_matrix = confusion_matrix(y_test, y_hat_test)
        np.set_printoptions(precision=2)
        if self.normalize_cm:
            cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
            cm_type = 'NORMALIZED CONFUSION MATRIX'
        else:
            cm_type = 'CONFUSION MATRIX, WITHOUT NORMALIZATION'

        fu.log('Calculating confusion matrix for testing dataset\n\n%s\n\n%s\n' % (cm_type, cnf_matrix), self.out_log, self.global_log)

        if self.io_dict["out"]["output_test_table_path"]:
            fu.log('Saving testing data to %s' % self.io_dict["out"]["output_test_table_path"], self.out_log, self.global_log)
            test_table.to_csv(self.io_dict["out"]["output_test_table_path"], index=False, header=True)

        # plot
        if self.io_dict["out"]["output_plot_path"]:
            vs = y.unique().tolist()
            vs.sort()
            if len(vs) > 2:
                plot = plotMultipleCM(cnf_matrix_train, cnf_matrix, self.normalize_cm, vs)
                fu.log('Saving confusion matrix plot to %s' % self.io_dict["out"]["output_plot_path"], self.out_log, self.global_log)
            else:
                plot = plotBinaryClassifier(model, yhat_prob_train, yhat_prob, cnf_matrix_train, cnf_matrix, y_train, y_test, normalize=self.normalize_cm)
                fu.log('Saving binary classifier evaluator plot to %s' % self.io_dict["out"]["output_plot_path"], self.out_log, self.global_log)
            plot.savefig(self.io_dict["out"]["output_plot_path"], dpi=150)

        # save model, scaler and parameters
        tv = y.unique().tolist()
        tv.sort()
        variables = {
            'target': self.target,
            'independent_vars': self.independent_vars,
            'scale': self.scale,
            'target_values': tv
        }
        fu.log('Saving model to %s' % self.io_dict["out"]["output_model_path"], self.out_log, self.global_log)
        with open(self.io_dict["out"]["output_model_path"], "wb") as f:
            joblib.dump(model, f)
            if self.scale:
                joblib.dump(scaler, f)
            joblib.dump(variables, f)
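        # Several objects are dumped sequentially into a single .pkl file; a consumer
        # restores them with matching sequential loads from the same handle. A minimal
        # sketch, assuming the model was trained with scale=True:
        #     with open('output_model.pkl', 'rb') as f:
        #         model = joblib.load(f)
        #         scaler = joblib.load(f)
        #         variables = joblib.load(f)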

        # Copy files to host
        self.copy_to_host()

        self.tmp_files.extend([
            self.stage_io_dict.get("unique_dir")
        ])
        self.remove_tmp_files()

        self.check_arguments(output_files_created=True, raise_exception=False)

        return 0


def support_vector_machine(input_dataset_path: str, output_model_path: str, output_test_table_path: str = None, output_plot_path: str = None, properties: dict = None, **kwargs) -> int:
    """Execute the :class:`SupportVectorMachine <classification.support_vector_machine.SupportVectorMachine>` class and
    execute the :meth:`launch() <classification.support_vector_machine.SupportVectorMachine.launch>` method."""

    return SupportVectorMachine(input_dataset_path=input_dataset_path,
                                output_model_path=output_model_path,
                                output_test_table_path=output_test_table_path,
                                output_plot_path=output_plot_path,
                                properties=properties, **kwargs).launch()


def main():
    """Command line execution of this building block. Please check the command line documentation."""
    parser = argparse.ArgumentParser(description="Wrapper of the scikit-learn SupportVectorMachine method.", formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999))
    parser.add_argument('--config', required=False, help='Configuration file')

    # Specific args of each building block
    required_args = parser.add_argument_group('required arguments')
    required_args.add_argument('--input_dataset_path', required=True, help='Path to the input dataset. Accepted formats: csv.')
    required_args.add_argument('--output_model_path', required=True, help='Path to the output model file. Accepted formats: pkl.')
    parser.add_argument('--output_test_table_path', required=False, help='Path to the test table file. Accepted formats: csv.')
    parser.add_argument('--output_plot_path', required=False, help='Path to the statistics plot. If the target is binary it shows the confusion matrix, the distributions of the predicted probabilities of both classes and the ROC curve. If the target is non-binary it shows the confusion matrix. Accepted formats: png.')

    args = parser.parse_args()
    args.config = args.config or "{}"
    properties = settings.ConfReader(config=args.config).get_prop_dic()

    # Specific call of each building block
    support_vector_machine(input_dataset_path=args.input_dataset_path,
                           output_model_path=args.output_model_path,
                           output_test_table_path=args.output_test_table_path,
                           output_plot_path=args.output_plot_path,
                           properties=properties)


if __name__ == '__main__':
    main()
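# Example command-line run (a minimal sketch; the file names below are placeholders,
# not files shipped with the package):
#   python support_vector_machine.py --config config.yml \
#       --input_dataset_path dataset_support_vector_machine.csv \
#       --output_model_path output_model.pkl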