Coverage for biobb_dna/dna/dna_bimodality.py: 68%

152 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2025-01-28 10:36 +0000

1#!/usr/bin/env python3 

2 

3"""Module containing the HelParBimodality class and the command line interface.""" 

4import os 

5import zipfile 

6import argparse 

7from typing import Optional 

8from pathlib import Path 

9 

10import pandas as pd 

11import numpy as np 

12import matplotlib.pyplot as plt 

13from sklearn.mixture import GaussianMixture # type: ignore 

14from biobb_dna.utils import constants 

15from biobb_common.generic.biobb_object import BiobbObject 

16from biobb_common.configuration import settings 

17from biobb_common.tools.file_utils import launchlogger 

18from biobb_dna.utils.loader import load_data 

19 

20 

21class HelParBimodality(BiobbObject): 

22 """ 

23 | biobb_dna HelParBimodality 

24 | Determine binormality/bimodality from a helical parameter series dataset. 

25 | Determine binormality/bimodality from a helical parameter series dataset. 

26 

27 Args: 

28 input_csv_file (str): Path to .csv file with helical parameter series. If `input_zip_file` is passed, this should be just the filename of the .csv file inside .zip. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/dna/series_shift_AT.csv>`_. Accepted formats: csv (edam:format_3752). 

29 input_zip_file (str) (Optional): .zip file containing the `input_csv_file` .csv file. File type: input. Accepted formats: zip (edam:format_3987). 

30 output_csv_path (str): Path to .csv file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/dna/AT_shift_bimod.csv>`_. Accepted formats: csv (edam:format_3752). 

31 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/dna/AT_shift_bimod.jpg>`_. Accepted formats: jpg (edam:format_3579). 

32 properties (dict): 

33 * **helpar_name** (*str*) - (Optional) helical parameter name. 

34 * **confidence_level** (*float*) - (5.0) Confidence level for Byes Factor test (in percentage). 

35 * **max_iter** (*int*) - (400) Number of maximum iterations for EM algorithm. 

36 * **tol** (*float*) - (1e-5) Tolerance value for EM algorithm. 

37 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files. 

38 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist. 

39 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.1 

40 

41 Examples: 

42 This is a use example of how to use the building block from Python:: 

43 

44 from biobb_dna.dna.dna_bimodality import dna_bimodality 

45 

46 prop = { 

47 'max_iter': 500, 

48 } 

49 dna_bimodality( 

50 input_csv_file='filename.csv', 

51 input_zip_file='/path/to/input.zip', 

52 output_csv_path='/path/to/output.csv', 

53 output_jpg_path='/path/to/output.jpg', 

54 properties=prop) 

55 Info: 

56 * wrapped_software: 

57 * name: In house 

58 * license: Apache-2.0 

59 * ontology: 

60 * name: EDAM 

61 * schema: http://edamontology.org/EDAM.owl 

62 

63 """ 

64 

65 def __init__(self, input_csv_file, output_csv_path, 

66 output_jpg_path, input_zip_file=None, 

67 properties=None, **kwargs) -> None: 

68 properties = properties or {} 

69 

70 # Call parent class constructor 

71 super().__init__(properties) 

72 self.locals_var_dict = locals().copy() 

73 

74 # Input/Output files 

75 self.io_dict = { 

76 'in': { 

77 'input_csv_file': input_csv_file, 

78 'input_zip_file': input_zip_file 

79 }, 

80 'out': { 

81 'output_csv_path': output_csv_path, 

82 'output_jpg_path': output_jpg_path 

83 } 

84 } 

85 

86 # Properties specific for BB 

87 self.confidence_level = properties.get( 

88 "confidence_level", 5.0) 

89 self.max_iter = properties.get( 

90 "max_iter", 400) 

91 self.tol = properties.get( 

92 "tol", 1e-5) 

93 self.helpar_name = properties.get("helpar_name", None) 

94 self.properties = properties 

95 

96 # Check the properties 

97 self.check_properties(properties) 

98 self.check_arguments() 

99 

100 @launchlogger 

101 def launch(self) -> int: 

102 """Execute the :class:`HelParBimodality <dna.dna_bimodality.HelParBimodality>` object.""" 

103 

104 # Setup Biobb 

105 if self.check_restart(): 

106 return 0 

107 self.stage_files() 

108 

109 # get helical parameter from filename if not specified 

110 if self.helpar_name is None: 

111 for hp in constants.helical_parameters: 

112 if self.stage_io_dict.get("in", {}).get("input_zip_file") is not None: 

113 condition_2 = ( 

114 hp.lower() in Path(self.stage_io_dict['in']['input_zip_file']).name.lower()) 

115 else: 

116 condition_2 = False 

117 condition_1 = hp.lower() in Path( 

118 self.stage_io_dict['in']['input_csv_file']).name.lower() 

119 if (condition_1 or condition_2): 

120 self.helpar_name = hp 

121 if self.helpar_name is None: 

122 raise ValueError( 

123 "Helical parameter name can't be inferred from file, " 

124 "so it must be specified!") 

125 else: 

126 if self.helpar_name not in constants.helical_parameters: 

127 raise ValueError( 

128 "Helical parameter name is invalid! " 

129 f"Options: {constants.helical_parameters}") 

130 

131 # get unit from helical parameter name 

132 if self.helpar_name in constants.hp_angular: 

133 self.hp_unit = "Degrees" 

134 else: 

135 self.hp_unit = "Angstroms" 

136 

137 # resolve output 

138 output_csv_path = Path( 

139 self.stage_io_dict['out']['output_csv_path']).resolve() 

140 output_jpg_path = Path( 

141 self.stage_io_dict['out']['output_jpg_path']).resolve() 

142 

143 # change directory to temporary folder 

144 original_directory = os.getcwd() 

145 os.chdir(self.stage_io_dict.get("unique_dir", "")) 

146 

147 # read input 

148 if self.stage_io_dict.get("in", {}).get("input_zip_file") is not None: 

149 # if zipfile is specified, extract to temporary folder 

150 with zipfile.ZipFile( 

151 self.stage_io_dict['in']['input_zip_file'], 

152 'r') as zip_ref: 

153 zip_ref.extractall(self.stage_io_dict.get("unique_dir", "")) 

154 

155 data = load_data(Path(self.stage_io_dict['in']['input_csv_file']).name) 

156 

157 means, variances, bics, weights = self.fit_to_model(data) 

158 uninormal, binormal, insuf_ev = self.bayes_factor_criteria( 

159 bics[0], bics[1]) 

160 

161 if binormal: 

162 maxm = np.argmax(means[1]) 

163 minm = np.argmin(means[1]) 

164 mean1 = means[1][minm] 

165 var1 = variances[1][minm] 

166 w1 = weights[1][minm] 

167 mean2 = means[1][maxm] 

168 var2 = variances[1][maxm] 

169 w2 = weights[1][maxm] 

170 bimodal = self.helguero_theorem(mean1, mean2, var1, var2) 

171 else: 

172 mean1 = means[0][0] 

173 var1 = variances[0][0] 

174 w1 = weights[0][0] 

175 mean2, var2, w2 = np.nan, np.nan, 0 

176 bimodal = False 

177 info = dict( 

178 binormal=binormal, 

179 uninormal=uninormal, 

180 insuf_ev=insuf_ev, 

181 bimodal=bimodal, 

182 mean1=mean1, 

183 mean2=mean2, 

184 var1=var1, 

185 var2=var2, 

186 w1=w1, 

187 w2=w2) 

188 

189 # save tables 

190 pd.DataFrame(info, index=data.columns).to_csv(output_csv_path) 

191 

192 # make and save plot 

193 data_size = len(data) 

194 synth1 = np.random.normal( 

195 loc=info['mean1'], 

196 scale=np.sqrt(info['var1']), 

197 size=int(data_size * info['w1'])) 

198 synth2 = np.random.normal( 

199 loc=info['mean2'], 

200 scale=np.sqrt(info['var2']), 

201 size=int(data_size * info['w2'])) 

202 

203 plt.figure() 

204 alpha = 0.7 

205 bins = 100 

206 if binormal: 

207 label1 = "Low State" 

208 else: 

209 label1 = "Single State" 

210 out = plt.hist( 

211 synth1, bins=bins, alpha=alpha, density=True, label=label1) 

212 ylim = max(out[0]) # type: ignore 

213 plt.vlines(info['mean1'], 0, ylim, colors="r", linestyles="dashed") 

214 if binormal: 

215 out = plt.hist( 

216 synth2, bins=bins, alpha=alpha, density=True, label="high state") 

217 ylim = max(out[0]) # type: ignore 

218 plt.vlines(info['mean2'], 0, ylim, colors="r", linestyles="dashed") 

219 plt.legend() 

220 plt.ylabel("Density") 

221 plt.xlabel(f"{self.helpar_name.capitalize()} ({self.hp_unit})") 

222 plt.title(f"Distribution of {self.helpar_name} states") 

223 plt.savefig(output_jpg_path, format="jpg") 

224 plt.close() 

225 

226 # change back to original directory 

227 os.chdir(original_directory) 

228 

229 # Copy files to host 

230 self.copy_to_host() 

231 

232 # Remove temporary file(s) 

233 # self.tmp_files.extend([ 

234 # self.stage_io_dict.get("unique_dir", "") 

235 # ]) 

236 self.remove_tmp_files() 

237 

238 self.check_arguments(output_files_created=True, raise_exception=False) 

239 

240 return 0 

241 

242 def fit_to_model(self, data): 

243 """ 

244 Fit data to Gaussian Mixture models. 

245 Return dictionary with distribution data. 

246 """ 

247 means = [] 

248 variances = [] 

249 bics = [] 

250 weights = [] 

251 for n_components in (1, 2): 

252 gmm = GaussianMixture( 

253 n_components=n_components, 

254 max_iter=self.max_iter, 

255 tol=self.tol) 

256 gmm = gmm.fit(data) 

257 m = gmm.means_.flatten() # type: ignore 

258 v = gmm.covariances_.flatten() # type: ignore 

259 b = gmm.bic(data) 

260 w = gmm.weights_.flatten() # type: ignore 

261 means.append(m) 

262 variances.append(v) 

263 bics.append(b) 

264 weights.append(w) 

265 return means, variances, bics, weights 

266 

267 def bayes_factor_criteria(self, bic1, bic2): 

268 diff_bic = bic2 - bic1 

269 # probability of a two-component model 

270 p = 1 / (1 + np.exp(0.5*diff_bic)) 

271 if p == np.nan: 

272 if bic1 == np.nan: 

273 p = 1 

274 elif bic2 == np.nan: 

275 p = 0 

276 

277 uninormal = p < (self.confidence_level / 100) 

278 binormal = p > (1 - (self.confidence_level / 100)) 

279 insuf_ev = True if (not uninormal and not binormal) else False 

280 return uninormal, binormal, insuf_ev 

281 

282 def helguero_theorem(self, mean1, mean2, var1, var2): 

283 r = var1 / var2 

284 separation_factor = np.sqrt( 

285 -2 + 3*r + 3*r**2 - 2*r**3 + 2*(1 - r + r**2)**1.5 

286 ) / ( 

287 np.sqrt(r)*(1+np.sqrt(r)) 

288 ) 

289 bimodal = abs(mean2-mean1) > separation_factor * \ 

290 (np.sqrt(var1) + np.sqrt(var2)) 

291 return bimodal 

292 

293 

294def dna_bimodality( 

295 input_csv_file, output_csv_path, output_jpg_path, 

296 input_zip_file: Optional[str] = None, properties: Optional[dict] = None, **kwargs) -> int: 

297 """Create :class:`HelParBimodality <dna.dna_bimodality.HelParBimodality>` class and 

298 execute the :meth:`launch() <dna.dna_bimodality.HelParBimodality.launch>` method.""" 

299 

300 return HelParBimodality( 

301 input_csv_file=input_csv_file, 

302 input_zip_file=input_zip_file, 

303 output_csv_path=output_csv_path, 

304 output_jpg_path=output_jpg_path, 

305 properties=properties, **kwargs).launch() 

306 

307 dna_bimodality.__doc__ = HelParBimodality.__doc__ 

308 

309 

310def main(): 

311 """Command line execution of this building block. Please check the command line documentation.""" 

312 parser = argparse.ArgumentParser(description='Determine binormality/bimodality from a helical parameter dataset.', 

313 formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999)) 

314 parser.add_argument('--config', required=False, help='Configuration file') 

315 

316 required_args = parser.add_argument_group('required arguments') 

317 required_args.add_argument('--input_csv_file', required=True, 

318 help='Path to csv file with data. Accepted formats: csv.') 

319 parser.add_argument('--input_zip_file', 

320 help='Path to zip file containing csv input files. Accepted formats: zip.') 

321 required_args.add_argument('--output_csv_path', required=True, 

322 help='Filename and/or path of output csv file.') 

323 required_args.add_argument('--output_jpg_path', required=True, 

324 help='Filename and/or path of output jpg file.') 

325 

326 args = parser.parse_args() 

327 args.config = args.config or "{}" 

328 properties = settings.ConfReader(config=args.config).get_prop_dic() 

329 

330 dna_bimodality( 

331 input_csv_file=args.input_csv_file, 

332 input_zip_file=args.input_zip_file, 

333 output_csv_path=args.output_csv_path, 

334 output_jpg_path=args.output_jpg_path, 

335 properties=properties) 

336 

337 

338if __name__ == '__main__': 

339 main()