Coverage for biobb_dna / dna / dna_bimodality.py: 74%

139 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-22 13:05 +0000

1#!/usr/bin/env python3 

2 

3"""Module containing the HelParBimodality class and the command line interface.""" 

4import os 

5import zipfile 

6from typing import Optional 

7from pathlib import Path 

8 

9import pandas as pd 

10import numpy as np 

11import matplotlib.pyplot as plt 

12from sklearn.mixture import GaussianMixture # type: ignore 

13from biobb_dna.utils import constants 

14from biobb_common.generic.biobb_object import BiobbObject 

15from biobb_common.tools.file_utils import launchlogger 

16from biobb_dna.utils.loader import load_data 

17 

18 

19class HelParBimodality(BiobbObject): 

20 """ 

21 | biobb_dna HelParBimodality 

22 | Determine binormality/bimodality from a helical parameter series dataset. 

23 | Determine binormality/bimodality from a helical parameter series dataset. 

24 

25 Args: 

26 input_csv_file (str): Path to .csv file with helical parameter series. If `input_zip_file` is passed, this should be just the filename of the .csv file inside .zip. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/dna/series_shift_AT.csv>`_. Accepted formats: csv (edam:format_3752). 

27 input_zip_file (str) (Optional): .zip file containing the `input_csv_file` .csv file. File type: input. Accepted formats: zip (edam:format_3987). 

28 output_csv_path (str): Path to .csv file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/dna/AT_shift_bimod.csv>`_. Accepted formats: csv (edam:format_3752). 

29 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/dna/AT_shift_bimod.jpg>`_. Accepted formats: jpg (edam:format_3579). 

30 properties (dict): 

31 * **helpar_name** (*str*) - (Optional) helical parameter name. 

32 * **confidence_level** (*float*) - (5.0) Confidence level for Byes Factor test (in percentage). 

33 * **max_iter** (*int*) - (400) Number of maximum iterations for EM algorithm. 

34 * **tol** (*float*) - (1e-5) Tolerance value for EM algorithm. 

35 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files. 

36 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist. 

37 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.1 

38 

39 Examples: 

40 This is a use example of how to use the building block from Python:: 

41 

42 from biobb_dna.dna.dna_bimodality import dna_bimodality 

43 

44 prop = { 

45 'max_iter': 500, 

46 } 

47 dna_bimodality( 

48 input_csv_file='filename.csv', 

49 input_zip_file='/path/to/input.zip', 

50 output_csv_path='/path/to/output.csv', 

51 output_jpg_path='/path/to/output.jpg', 

52 properties=prop) 

53 Info: 

54 * wrapped_software: 

55 * name: In house 

56 * license: Apache-2.0 

57 * ontology: 

58 * name: EDAM 

59 * schema: http://edamontology.org/EDAM.owl 

60 

61 """ 

62 

63 def __init__(self, input_csv_file, output_csv_path, 

64 output_jpg_path, input_zip_file=None, 

65 properties=None, **kwargs) -> None: 

66 properties = properties or {} 

67 

68 # Call parent class constructor 

69 super().__init__(properties) 

70 self.locals_var_dict = locals().copy() 

71 

72 # Input/Output files 

73 self.io_dict = { 

74 'in': { 

75 'input_csv_file': input_csv_file, 

76 'input_zip_file': input_zip_file 

77 }, 

78 'out': { 

79 'output_csv_path': output_csv_path, 

80 'output_jpg_path': output_jpg_path 

81 } 

82 } 

83 

84 # Properties specific for BB 

85 self.confidence_level = properties.get( 

86 "confidence_level", 5.0) 

87 self.max_iter = properties.get( 

88 "max_iter", 400) 

89 self.tol = properties.get( 

90 "tol", 1e-5) 

91 self.helpar_name = properties.get("helpar_name", None) 

92 self.properties = properties 

93 

94 # Check the properties 

95 self.check_properties(properties) 

96 self.check_arguments() 

97 

98 @launchlogger 

99 def launch(self) -> int: 

100 """Execute the :class:`HelParBimodality <dna.dna_bimodality.HelParBimodality>` object.""" 

101 

102 # Setup Biobb 

103 if self.check_restart(): 

104 return 0 

105 self.stage_files() 

106 

107 # get helical parameter from filename if not specified 

108 if self.helpar_name is None: 

109 for hp in constants.helical_parameters: 

110 if self.stage_io_dict.get("in", {}).get("input_zip_file") is not None: 

111 condition_2 = ( 

112 hp.lower() in Path(self.stage_io_dict['in']['input_zip_file']).name.lower()) 

113 else: 

114 condition_2 = False 

115 condition_1 = hp.lower() in Path( 

116 self.stage_io_dict['in']['input_csv_file']).name.lower() 

117 if (condition_1 or condition_2): 

118 self.helpar_name = hp 

119 if self.helpar_name is None: 

120 raise ValueError( 

121 "Helical parameter name can't be inferred from file, " 

122 "so it must be specified!") 

123 else: 

124 if self.helpar_name not in constants.helical_parameters: 

125 raise ValueError( 

126 "Helical parameter name is invalid! " 

127 f"Options: {constants.helical_parameters}") 

128 

129 # get unit from helical parameter name 

130 if self.helpar_name in constants.hp_angular: 

131 self.hp_unit = "Degrees" 

132 else: 

133 self.hp_unit = "Angstroms" 

134 

135 # resolve output 

136 output_csv_path = Path( 

137 self.stage_io_dict['out']['output_csv_path']).resolve() 

138 output_jpg_path = Path( 

139 self.stage_io_dict['out']['output_jpg_path']).resolve() 

140 

141 # change directory to temporary folder 

142 original_directory = os.getcwd() 

143 os.chdir(self.stage_io_dict.get("unique_dir", "")) 

144 

145 # read input 

146 if self.stage_io_dict.get("in", {}).get("input_zip_file") is not None: 

147 # if zipfile is specified, extract to temporary folder 

148 with zipfile.ZipFile( 

149 self.stage_io_dict['in']['input_zip_file'], 

150 'r') as zip_ref: 

151 zip_ref.extractall(self.stage_io_dict.get("unique_dir", "")) 

152 

153 data = load_data(Path(self.stage_io_dict['in']['input_csv_file']).name) 

154 

155 means, variances, bics, weights = self.fit_to_model(data) 

156 uninormal, binormal, insuf_ev = self.bayes_factor_criteria( 

157 bics[0], bics[1]) 

158 

159 if binormal: 

160 maxm = np.argmax(means[1]) 

161 minm = np.argmin(means[1]) 

162 mean1 = means[1][minm] 

163 var1 = variances[1][minm] 

164 w1 = weights[1][minm] 

165 mean2 = means[1][maxm] 

166 var2 = variances[1][maxm] 

167 w2 = weights[1][maxm] 

168 bimodal = self.helguero_theorem(mean1, mean2, var1, var2) 

169 else: 

170 mean1 = means[0][0] 

171 var1 = variances[0][0] 

172 w1 = weights[0][0] 

173 mean2, var2, w2 = np.nan, np.nan, 0 

174 bimodal = False 

175 info = dict( 

176 binormal=binormal, 

177 uninormal=uninormal, 

178 insuf_ev=insuf_ev, 

179 bimodal=bimodal, 

180 mean1=mean1, 

181 mean2=mean2, 

182 var1=var1, 

183 var2=var2, 

184 w1=w1, 

185 w2=w2) 

186 

187 # save tables 

188 pd.DataFrame(info, index=data.columns).to_csv(output_csv_path) 

189 

190 # make and save plot 

191 data_size = len(data) 

192 synth1 = np.random.normal( 

193 loc=info['mean1'], 

194 scale=np.sqrt(info['var1']), 

195 size=int(data_size * info['w1'])) 

196 synth2 = np.random.normal( 

197 loc=info['mean2'], 

198 scale=np.sqrt(info['var2']), 

199 size=int(data_size * info['w2'])) 

200 

201 plt.figure() 

202 alpha = 0.7 

203 bins = 100 

204 if binormal: 

205 label1 = "Low State" 

206 else: 

207 label1 = "Single State" 

208 out = plt.hist( 

209 synth1, bins=bins, alpha=alpha, density=True, label=label1) 

210 ylim = max(out[0]) # type: ignore 

211 plt.vlines(info['mean1'], 0, ylim, colors="r", linestyles="dashed") 

212 if binormal: 

213 out = plt.hist( 

214 synth2, bins=bins, alpha=alpha, density=True, label="high state") 

215 ylim = max(out[0]) # type: ignore 

216 plt.vlines(info['mean2'], 0, ylim, colors="r", linestyles="dashed") 

217 plt.legend() 

218 plt.ylabel("Density") 

219 plt.xlabel(f"{self.helpar_name.capitalize()} ({self.hp_unit})") 

220 plt.title(f"Distribution of {self.helpar_name} states") 

221 plt.savefig(output_jpg_path, format="jpg") 

222 plt.close() 

223 

224 # change back to original directory 

225 os.chdir(original_directory) 

226 

227 # Copy files to host 

228 self.copy_to_host() 

229 

230 # Remove temporary file(s) 

231 self.remove_tmp_files() 

232 

233 self.check_arguments(output_files_created=True, raise_exception=False) 

234 

235 return 0 

236 

237 def fit_to_model(self, data): 

238 """ 

239 Fit data to Gaussian Mixture models. 

240 Return dictionary with distribution data. 

241 """ 

242 means = [] 

243 variances = [] 

244 bics = [] 

245 weights = [] 

246 for n_components in (1, 2): 

247 gmm = GaussianMixture( 

248 n_components=n_components, 

249 max_iter=self.max_iter, 

250 tol=self.tol) 

251 gmm = gmm.fit(data) 

252 m = gmm.means_.flatten() # type: ignore 

253 v = gmm.covariances_.flatten() # type: ignore 

254 b = gmm.bic(data) 

255 w = gmm.weights_.flatten() # type: ignore 

256 means.append(m) 

257 variances.append(v) 

258 bics.append(b) 

259 weights.append(w) 

260 return means, variances, bics, weights 

261 

262 def bayes_factor_criteria(self, bic1, bic2): 

263 diff_bic = bic2 - bic1 

264 # probability of a two-component model 

265 p = 1 / (1 + np.exp(0.5*diff_bic)) 

266 if p == np.nan: 

267 if bic1 == np.nan: 

268 p = 1 

269 elif bic2 == np.nan: 

270 p = 0 

271 

272 uninormal = p < (self.confidence_level / 100) 

273 binormal = p > (1 - (self.confidence_level / 100)) 

274 insuf_ev = True if (not uninormal and not binormal) else False 

275 return uninormal, binormal, insuf_ev 

276 

277 def helguero_theorem(self, mean1, mean2, var1, var2): 

278 r = var1 / var2 

279 separation_factor = np.sqrt( 

280 -2 + 3*r + 3*r**2 - 2*r**3 + 2*(1 - r + r**2)**1.5 

281 ) / ( 

282 np.sqrt(r)*(1+np.sqrt(r)) 

283 ) 

284 bimodal = abs(mean2-mean1) > separation_factor * \ 

285 (np.sqrt(var1) + np.sqrt(var2)) 

286 return bimodal 

287 

288 

289def dna_bimodality( 

290 input_csv_file, output_csv_path, output_jpg_path, 

291 input_zip_file: Optional[str] = None, properties: Optional[dict] = None, **kwargs) -> int: 

292 """Create :class:`HelParBimodality <dna.dna_bimodality.HelParBimodality>` class and 

293 execute the :meth:`launch() <dna.dna_bimodality.HelParBimodality.launch>` method.""" 

294 return HelParBimodality(**dict(locals())).launch() 

295 

296 

297dna_bimodality.__doc__ = HelParBimodality.__doc__ 

298main = HelParBimodality.get_main(dna_bimodality, "Determine binormality/bimodality from a helical parameter dataset.") 

299 

300if __name__ == '__main__': 

301 main()