Coverage for biobb_dna/dna/dna

1#!/usr/bin/env python3

3"""Module containing the HelParBimodality class and the command line interface."""

4import os

5import zipfile

6import argparse

7from typing import Optional

8from pathlib import Path

10import pandas as pd

11import numpy as np

12import matplotlib.pyplot as plt

13from sklearn.mixture import GaussianMixture # type: ignore

14from biobb_dna.utils import constants

15from biobb_common.generic.biobb_object import BiobbObject

16from biobb_common.configuration import settings

17from biobb_common.tools.file_utils import launchlogger

18from biobb_dna.utils.loader import load_data

21class HelParBimodality(BiobbObject):

22 """

23 | biobb_dna HelParBimodality

24 | Determine binormality/bimodality from a helical parameter series dataset.

25 | Determine binormality/bimodality from a helical parameter series dataset.

27 Args:

28 input_csv_file (str): Path to .csv file with helical parameter series. If `input_zip_file` is passed, this should be just the filename of the .csv file inside .zip. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/dna/series_shift_AT.csv>`_. Accepted formats: csv (edam:format_3752).

29 input_zip_file (str) (Optional): .zip file containing the `input_csv_file` .csv file. File type: input. Accepted formats: zip (edam:format_3987).

30 output_csv_path (str): Path to .csv file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/dna/AT_shift_bimod.csv>`_. Accepted formats: csv (edam:format_3752).

31 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/dna/AT_shift_bimod.jpg>`_. Accepted formats: jpg (edam:format_3579).

32 properties (dict):

33 * **helpar_name** (*str*) - (Optional) helical parameter name.

34 * **confidence_level** (*float*) - (5.0) Confidence level for Byes Factor test (in percentage).

35 * **max_iter** (*int*) - (400) Number of maximum iterations for EM algorithm.

36 * **tol** (*float*) - (1e-5) Tolerance value for EM algorithm.

37 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.

38 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.

39 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.1

41 Examples:

42 This is a use example of how to use the building block from Python::

44 from biobb_dna.dna.dna_bimodality import dna_bimodality

46 prop = {

47 'max_iter': 500,

48 }

49 dna_bimodality(

50 input_csv_file='filename.csv',

51 input_zip_file='/path/to/input.zip',

52 output_csv_path='/path/to/output.csv',

53 output_jpg_path='/path/to/output.jpg',

54 properties=prop)

55 Info:

56 * wrapped_software:

57 * name: In house

58 * license: Apache-2.0

59 * ontology:

60 * name: EDAM

61 * schema: http://edamontology.org/EDAM.owl

63 """

65 def __init__(self, input_csv_file, output_csv_path,

66 output_jpg_path, input_zip_file=None,

67 properties=None, **kwargs) -> None:

68 properties = properties or {}

70 # Call parent class constructor

71 super().__init__(properties)

72 self.locals_var_dict = locals().copy()

74 # Input/Output files

75 self.io_dict = {

76 'in': {

77 'input_csv_file': input_csv_file,

78 'input_zip_file': input_zip_file

79 },

80 'out': {

81 'output_csv_path': output_csv_path,

82 'output_jpg_path': output_jpg_path

83 }

84 }

86 # Properties specific for BB

87 self.confidence_level = properties.get(

88 "confidence_level", 5.0)

89 self.max_iter = properties.get(

90 "max_iter", 400)

91 self.tol = properties.get(

92 "tol", 1e-5)

93 self.helpar_name = properties.get("helpar_name", None)

94 self.properties = properties

96 # Check the properties

97 self.check_properties(properties)

98 self.check_arguments()

100 @launchlogger

101 def launch(self) -> int:

102 """Execute the :class:`HelParBimodality <dna.dna_bimodality.HelParBimodality>` object."""

103

104 # Setup Biobb

105 if self.check_restart():

106 return 0

107 self.stage_files()

108

109 # get helical parameter from filename if not specified

110 if self.helpar_name is None:

111 for hp in constants.helical_parameters:

112 if self.stage_io_dict.get("in", {}).get("input_zip_file") is not None:

113 condition_2 = (

114 hp.lower() in Path(self.stage_io_dict['in']['input_zip_file']).name.lower())

115 else:

116 condition_2 = False

117 condition_1 = hp.lower() in Path(

118 self.stage_io_dict['in']['input_csv_file']).name.lower()

119 if (condition_1 or condition_2):

120 self.helpar_name = hp

121 if self.helpar_name is None:

122 raise ValueError(

123 "Helical parameter name can't be inferred from file, "

124 "so it must be specified!")

125 else:

126 if self.helpar_name not in constants.helical_parameters:

127 raise ValueError(

128 "Helical parameter name is invalid! "

129 f"Options: {constants.helical_parameters}")

130

131 # get unit from helical parameter name

132 if self.helpar_name in constants.hp_angular:

133 self.hp_unit = "Degrees"

134 else:

135 self.hp_unit = "Angstroms"

136

137 # resolve output

138 output_csv_path = Path(

139 self.stage_io_dict['out']['output_csv_path']).resolve()

140 output_jpg_path = Path(

141 self.stage_io_dict['out']['output_jpg_path']).resolve()

142

143 # change directory to temporary folder

144 original_directory = os.getcwd()

145 os.chdir(self.stage_io_dict.get("unique_dir", ""))

146

147 # read input

148 if self.stage_io_dict.get("in", {}).get("input_zip_file") is not None:

149 # if zipfile is specified, extract to temporary folder

150 with zipfile.ZipFile(

151 self.stage_io_dict['in']['input_zip_file'],

152 'r') as zip_ref:

153 zip_ref.extractall(self.stage_io_dict.get("unique_dir", ""))

154

155 data = load_data(Path(self.stage_io_dict['in']['input_csv_file']).name)

156

157 means, variances, bics, weights = self.fit_to_model(data)

158 uninormal, binormal, insuf_ev = self.bayes_factor_criteria(

159 bics[0], bics[1])

160

161 if binormal:

162 maxm = np.argmax(means[1])

163 minm = np.argmin(means[1])

164 mean1 = means[1][minm]

165 var1 = variances[1][minm]

166 w1 = weights[1][minm]

167 mean2 = means[1][maxm]

168 var2 = variances[1][maxm]

169 w2 = weights[1][maxm]

170 bimodal = self.helguero_theorem(mean1, mean2, var1, var2)

171 else:

172 mean1 = means[0][0]

173 var1 = variances[0][0]

174 w1 = weights[0][0]

175 mean2, var2, w2 = np.nan, np.nan, 0

176 bimodal = False

177 info = dict(

178 binormal=binormal,

179 uninormal=uninormal,

180 insuf_ev=insuf_ev,

181 bimodal=bimodal,

182 mean1=mean1,

183 mean2=mean2,

184 var1=var1,

185 var2=var2,

186 w1=w1,

187 w2=w2)

188

189 # save tables

190 pd.DataFrame(info, index=data.columns).to_csv(output_csv_path)

191

192 # make and save plot

193 data_size = len(data)

194 synth1 = np.random.normal(

195 loc=info['mean1'],

196 scale=np.sqrt(info['var1']),

197 size=int(data_size * info['w1']))

198 synth2 = np.random.normal(

199 loc=info['mean2'],

200 scale=np.sqrt(info['var2']),

201 size=int(data_size * info['w2']))

202

203 plt.figure()

204 alpha = 0.7

205 bins = 100

206 if binormal:

207 label1 = "Low State"

208 else:

209 label1 = "Single State"

210 out = plt.hist(

211 synth1, bins=bins, alpha=alpha, density=True, label=label1)

212 ylim = max(out[0]) # type: ignore

213 plt.vlines(info['mean1'], 0, ylim, colors="r", linestyles="dashed")

214 if binormal:

215 out = plt.hist(

216 synth2, bins=bins, alpha=alpha, density=True, label="high state")

217 ylim = max(out[0]) # type: ignore

218 plt.vlines(info['mean2'], 0, ylim, colors="r", linestyles="dashed")

219 plt.legend()

220 plt.ylabel("Density")

221 plt.xlabel(f"{self.helpar_name.capitalize()} ({self.hp_unit})")

222 plt.title(f"Distribution of {self.helpar_name} states")

223 plt.savefig(output_jpg_path, format="jpg")

224 plt.close()

225

226 # change back to original directory

227 os.chdir(original_directory)

228

229 # Copy files to host

230 self.copy_to_host()

231

232 # Remove temporary file(s)

233 # self.tmp_files.extend([

234 # self.stage_io_dict.get("unique_dir", "")

235 # ])

236 self.remove_tmp_files()

237

238 self.check_arguments(output_files_created=True, raise_exception=False)

239

240 return 0

241

242 def fit_to_model(self, data):

243 """

244 Fit data to Gaussian Mixture models.

245 Return dictionary with distribution data.

246 """

247 means = []

248 variances = []

249 bics = []

250 weights = []

251 for n_components in (1, 2):

252 gmm = GaussianMixture(

253 n_components=n_components,

254 max_iter=self.max_iter,

255 tol=self.tol)

256 gmm = gmm.fit(data)

257 m = gmm.means_.flatten() # type: ignore

258 v = gmm.covariances_.flatten() # type: ignore

259 b = gmm.bic(data)

260 w = gmm.weights_.flatten() # type: ignore

261 means.append(m)

262 variances.append(v)

263 bics.append(b)

264 weights.append(w)

265 return means, variances, bics, weights

266

267 def bayes_factor_criteria(self, bic1, bic2):

268 diff_bic = bic2 - bic1

269 # probability of a two-component model

270 p = 1 / (1 + np.exp(0.5*diff_bic))

271 if p == np.nan:

272 if bic1 == np.nan:

273 p = 1

274 elif bic2 == np.nan:

275 p = 0

276

277 uninormal = p < (self.confidence_level / 100)

278 binormal = p > (1 - (self.confidence_level / 100))

279 insuf_ev = True if (not uninormal and not binormal) else False

280 return uninormal, binormal, insuf_ev

281

282 def helguero_theorem(self, mean1, mean2, var1, var2):

283 r = var1 / var2

284 separation_factor = np.sqrt(

285 -2 + 3*r + 3*r**2 - 2*r**3 + 2*(1 - r + r**2)**1.5

286 ) / (

287 np.sqrt(r)*(1+np.sqrt(r))

288 )

289 bimodal = abs(mean2-mean1) > separation_factor * \

290 (np.sqrt(var1) + np.sqrt(var2))

291 return bimodal

292

293

294def dna_bimodality(

295 input_csv_file, output_csv_path, output_jpg_path,

296 input_zip_file: Optional[str] = None, properties: Optional[dict] = None, **kwargs) -> int:

297 """Create :class:`HelParBimodality <dna.dna_bimodality.HelParBimodality>` class and

298 execute the :meth:`launch() <dna.dna_bimodality.HelParBimodality.launch>` method."""

299

300 return HelParBimodality(

301 input_csv_file=input_csv_file,

302 input_zip_file=input_zip_file,

303 output_csv_path=output_csv_path,

304 output_jpg_path=output_jpg_path,

305 properties=properties, **kwargs).launch()

306

307 dna_bimodality.__doc__ = HelParBimodality.__doc__

308

309

310def main():

311 """Command line execution of this building block. Please check the command line documentation."""

312 parser = argparse.ArgumentParser(description='Determine binormality/bimodality from a helical parameter dataset.',

313 formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999))

314 parser.add_argument('--config', required=False, help='Configuration file')

315

316 required_args = parser.add_argument_group('required arguments')

317 required_args.add_argument('--input_csv_file', required=True,

318 help='Path to csv file with data. Accepted formats: csv.')

319 parser.add_argument('--input_zip_file',

320 help='Path to zip file containing csv input files. Accepted formats: zip.')

321 required_args.add_argument('--output_csv_path', required=True,

322 help='Filename and/or path of output csv file.')

323 required_args.add_argument('--output_jpg_path', required=True,

324 help='Filename and/or path of output jpg file.')

325

326 args = parser.parse_args()

327 args.config = args.config or "{}"

328 properties = settings.ConfReader(config=args.config).get_prop_dic()

329

330 dna_bimodality(

331 input_csv_file=args.input_csv_file,

332 input_zip_file=args.input_zip_file,

333 output_csv_path=args.output_csv_path,

334 output_jpg_path=args.output_jpg_path,

335 properties=properties)

336

337

338if __name__ == '__main__':

339 main()

Coverage for biobb_dna/dna/dna_bimodality.py: 68%

152 statements