Coverage for biobb_dna/dna/dna_bimodality.py: 69%

158 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-05-07 09:06 +0000

1#!/usr/bin/env python3 

2 

3"""Module containing the HelParBimodality class and the command line interface.""" 

4import os 

5import shutil 

6import zipfile 

7import argparse 

8from pathlib import Path 

9 

10import pandas as pd 

11import numpy as np 

12import matplotlib.pyplot as plt 

13from sklearn.mixture import GaussianMixture 

14 

15from biobb_dna.utils import constants 

16from biobb_common.generic.biobb_object import BiobbObject 

17from biobb_common.configuration import settings 

18from biobb_common.tools import file_utils as fu 

19from biobb_common.tools.file_utils import launchlogger 

20from biobb_dna.utils.loader import load_data 

21 

22 

23class HelParBimodality(BiobbObject): 

24 """ 

25 | biobb_dna HelParBimodality 

26 | Determine binormality/bimodality from a helical parameter series dataset. 

27 

28 Args: 

29 input_csv_file (str): Path to .csv file with helical parameter series. If `input_zip_file` is passed, this should be just the filename of the .csv file inside .zip. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/dna/series_shift_AT.csv>`_. Accepted formats: csv (edam:format_3752). 

30 input_zip_file (str) (Optional): .zip file containing the `input_csv_file` .csv file. File type: input. Accepted formats: zip (edam:format_3987). 

31 output_csv_path (str): Path to .csv file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/dna/AT_shift_bimod.csv>`_. Accepted formats: csv (edam:format_3752). 

32 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/dna/AT_shift_bimod.jpg>`_. Accepted formats: jpg (edam:format_3579). 

33 properties (dict): 

34 * **helpar_name** (*str*) - (Optional) helical parameter name. 

35 * **confidence_level** (*float*) - (5.0) Confidence level for Byes Factor test (in percentage). 

36 * **max_iter** (*int*) - (400) Number of maximum iterations for EM algorithm. 

37 * **tol** (*float*) - (1e-5) Tolerance value for EM algorithm. 

38 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files. 

39 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.1 

40 

41 Examples: 

42 This is a use example of how to use the building block from Python:: 

43 

44 from biobb_dna.dna.dna_bimodality import dna_bimodality 

45 

46 prop = { 

47 'max_iter': 500, 

48 } 

49 dna_bimodality( 

50 input_csv_file='filename.csv', 

51 input_zip_file='/path/to/input.zip', 

52 output_csv_path='/path/to/output.csv', 

53 output_jpg_path='/path/to/output.jpg', 

54 properties=prop) 

55 Info: 

56 * wrapped_software: 

57 * name: In house 

58 * license: Apache-2.0 

59 * ontology: 

60 * name: EDAM 

61 * schema: http://edamontology.org/EDAM.owl 

62 

63 """ 

64 

65 def __init__(self, input_csv_file, output_csv_path, 

66 output_jpg_path, input_zip_file=None, 

67 properties=None, **kwargs) -> None: 

68 properties = properties or {} 

69 

70 # Call parent class constructor 

71 super().__init__(properties) 

72 self.locals_var_dict = locals().copy() 

73 

74 # Input/Output files 

75 self.io_dict = { 

76 'in': { 

77 'input_csv_file': input_csv_file, 

78 'input_zip_file': input_zip_file 

79 }, 

80 'out': { 

81 'output_csv_path': output_csv_path, 

82 'output_jpg_path': output_jpg_path 

83 } 

84 } 

85 

86 # Properties specific for BB 

87 self.confidence_level = properties.get( 

88 "confidence_level", 5.0) 

89 self.max_iter = properties.get( 

90 "max_iter", 400) 

91 self.tol = properties.get( 

92 "tol", 1e-5) 

93 self.helpar_name = properties.get("helpar_name", None) 

94 self.properties = properties 

95 

96 # Check the properties 

97 self.check_properties(properties) 

98 self.check_arguments() 

99 

100 @launchlogger 

101 def launch(self) -> int: 

102 """Execute the :class:`HelParBimodality <dna.dna_bimodality.HelParBimodality>` object.""" 

103 

104 # Setup Biobb 

105 if self.check_restart(): 

106 return 0 

107 self.stage_files() 

108 

109 # get helical parameter from filename if not specified 

110 if self.helpar_name is None: 

111 for hp in constants.helical_parameters: 

112 input_zip_path = self.io_dict['in']['input_zip_file'] 

113 if input_zip_path is not None: 

114 condition_2 = ( 

115 hp.lower() in Path(input_zip_path).name.lower()) 

116 else: 

117 condition_2 = False 

118 condition_1 = hp.lower() in Path( 

119 self.io_dict['in']['input_csv_file']).name.lower() 

120 if (condition_1 or condition_2): 

121 self.helpar_name = hp 

122 if self.helpar_name is None: 

123 raise ValueError( 

124 "Helical parameter name can't be inferred from file, " 

125 "so it must be specified!") 

126 else: 

127 if self.helpar_name not in constants.helical_parameters: 

128 raise ValueError( 

129 "Helical parameter name is invalid! " 

130 f"Options: {constants.helical_parameters}") 

131 

132 # get unit from helical parameter name 

133 if self.helpar_name in constants.hp_angular: 

134 self.hp_unit = "Degrees" 

135 else: 

136 self.hp_unit = "Angstroms" 

137 

138 # resolve output 

139 output_csv_path = Path( 

140 self.io_dict['out']['output_csv_path']).resolve() 

141 output_jpg_path = Path( 

142 self.io_dict['out']['output_jpg_path']).resolve() 

143 

144 # Creating temporary folder 

145 self.tmp_folder = fu.create_unique_dir(prefix="bimodality_") 

146 fu.log('Creating %s temporary folder' % self.tmp_folder, self.out_log) 

147 

148 # read input 

149 if self.io_dict['in']['input_zip_file'] is not None: 

150 # Copy input_file_path1 to temporary folder 

151 shutil.copy(self.io_dict['in']['input_zip_file'], self.tmp_folder) 

152 # if zipfile is specified, extract to temporary folder 

153 with zipfile.ZipFile( 

154 self.io_dict['in']['input_zip_file'], 

155 'r') as zip_ref: 

156 zip_ref.extractall(self.tmp_folder) 

157 else: 

158 # copy input files to temporary folder 

159 shutil.copy( 

160 self.io_dict['in']['input_csv_file'], 

161 self.tmp_folder) 

162 # change directory to temporary folder 

163 original_directory = os.getcwd() 

164 os.chdir(self.tmp_folder) 

165 data = load_data(Path(self.io_dict['in']['input_csv_file']).name) 

166 

167 means, variances, bics, weights = self.fit_to_model(data) 

168 uninormal, binormal, insuf_ev = self.bayes_factor_criteria( 

169 bics[0], bics[1]) 

170 

171 if binormal: 

172 maxm = np.argmax(means[1]) 

173 minm = np.argmin(means[1]) 

174 mean1 = means[1][minm] 

175 var1 = variances[1][minm] 

176 w1 = weights[1][minm] 

177 mean2 = means[1][maxm] 

178 var2 = variances[1][maxm] 

179 w2 = weights[1][maxm] 

180 bimodal = self.helguero_theorem(mean1, mean2, var1, var2) 

181 else: 

182 mean1 = means[0][0] 

183 var1 = variances[0][0] 

184 w1 = weights[0][0] 

185 mean2, var2, w2 = np.nan, np.nan, 0 

186 bimodal = False 

187 info = dict( 

188 binormal=binormal, 

189 uninormal=uninormal, 

190 insuf_ev=insuf_ev, 

191 bimodal=bimodal, 

192 mean1=mean1, 

193 mean2=mean2, 

194 var1=var1, 

195 var2=var2, 

196 w1=w1, 

197 w2=w2) 

198 

199 # save tables 

200 pd.DataFrame(info, index=data.columns).to_csv(output_csv_path) 

201 

202 # make and save plot 

203 data_size = len(data) 

204 synth1 = np.random.normal( 

205 loc=info['mean1'], 

206 scale=np.sqrt(info['var1']), 

207 size=int(data_size * info['w1'])) 

208 synth2 = np.random.normal( 

209 loc=info['mean2'], 

210 scale=np.sqrt(info['var2']), 

211 size=int(data_size * info['w2'])) 

212 

213 plt.figure() 

214 alpha = 0.7 

215 bins = 100 

216 if binormal: 

217 label1 = "Low State" 

218 else: 

219 label1 = "Single State" 

220 out = plt.hist( 

221 synth1, bins=bins, alpha=alpha, density=True, label=label1) 

222 ylim = max(out[0]) 

223 plt.vlines(info['mean1'], 0, ylim, colors="r", linestyles="dashed") 

224 if binormal: 

225 out = plt.hist( 

226 synth2, bins=bins, alpha=alpha, density=True, label="high state") 

227 ylim = max(out[0]) 

228 plt.vlines(info['mean2'], 0, ylim, colors="r", linestyles="dashed") 

229 plt.legend() 

230 plt.ylabel("Density") 

231 plt.xlabel(f"{self.helpar_name.capitalize()} ({self.hp_unit})") 

232 plt.title(f"Distribution of {self.helpar_name} states") 

233 plt.savefig(output_jpg_path, format="jpg") 

234 plt.close() 

235 

236 # change back to original directory 

237 os.chdir(original_directory) 

238 

239 # Remove temporary file(s) 

240 self.tmp_files.extend([ 

241 self.stage_io_dict.get("unique_dir"), 

242 self.tmp_folder 

243 ]) 

244 self.remove_tmp_files() 

245 

246 self.check_arguments(output_files_created=True, raise_exception=False) 

247 

248 return 0 

249 

250 def fit_to_model(self, data): 

251 """ 

252 Fit data to Gaussian Mixture models. 

253 Return dictionary with distribution data. 

254 """ 

255 means = [] 

256 variances = [] 

257 bics = [] 

258 weights = [] 

259 for n_components in (1, 2): 

260 gmm = GaussianMixture( 

261 n_components=n_components, 

262 max_iter=self.max_iter, 

263 tol=self.tol) 

264 gmm = gmm.fit(data) 

265 m = gmm.means_.flatten() 

266 v = gmm.covariances_.flatten() 

267 b = gmm.bic(data) 

268 w = gmm.weights_.flatten() 

269 means.append(m) 

270 variances.append(v) 

271 bics.append(b) 

272 weights.append(w) 

273 return means, variances, bics, weights 

274 

275 def bayes_factor_criteria(self, bic1, bic2): 

276 diff_bic = bic2 - bic1 

277 # probability of a two-component model 

278 p = 1 / (1 + np.exp(0.5*diff_bic)) 

279 if p == np.nan: 

280 if bic1 == np.nan: 

281 p = 1 

282 elif bic2 == np.nan: 

283 p = 0 

284 

285 uninormal = p < (self.confidence_level / 100) 

286 binormal = p > (1 - (self.confidence_level / 100)) 

287 insuf_ev = True if (not uninormal and not binormal) else False 

288 return uninormal, binormal, insuf_ev 

289 

290 def helguero_theorem(self, mean1, mean2, var1, var2): 

291 r = var1 / var2 

292 separation_factor = np.sqrt( 

293 -2 + 3*r + 3*r**2 - 2*r**3 + 2*(1 - r + r**2)**1.5 

294 ) / ( 

295 np.sqrt(r)*(1+np.sqrt(r)) 

296 ) 

297 bimodal = abs(mean2-mean1) > separation_factor * \ 

298 (np.sqrt(var1) + np.sqrt(var2)) 

299 return bimodal 

300 

301 

302def dna_bimodality( 

303 input_csv_file, output_csv_path, output_jpg_path, 

304 input_zip_file: str = None, properties: dict = None, **kwargs) -> int: 

305 """Create :class:`HelParBimodality <dna.dna_bimodality.HelParBimodality>` class and 

306 execute the :meth:`launch() <dna.dna_bimodality.HelParBimodality.launch>` method.""" 

307 

308 return HelParBimodality( 

309 input_csv_file=input_csv_file, 

310 input_zip_file=input_zip_file, 

311 output_csv_path=output_csv_path, 

312 output_jpg_path=output_jpg_path, 

313 properties=properties, **kwargs).launch() 

314 

315 

316def main(): 

317 """Command line execution of this building block. Please check the command line documentation.""" 

318 parser = argparse.ArgumentParser(description='Determine binormality/bimodality from a helical parameter dataset.', 

319 formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999)) 

320 parser.add_argument('--config', required=False, help='Configuration file') 

321 

322 required_args = parser.add_argument_group('required arguments') 

323 required_args.add_argument('--input_csv_file', required=True, 

324 help='Path to csv file with data. Accepted formats: csv.') 

325 parser.add_argument('--input_zip_file', 

326 help='Path to zip file containing csv input files. Accepted formats: zip.') 

327 required_args.add_argument('--output_csv_path', required=True, 

328 help='Filename and/or path of output csv file.') 

329 required_args.add_argument('--output_jpg_path', required=True, 

330 help='Filename and/or path of output jpg file.') 

331 

332 args = parser.parse_args() 

333 args.config = args.config or "{}" 

334 properties = settings.ConfReader(config=args.config).get_prop_dic() 

335 

336 dna_bimodality( 

337 input_csv_file=args.input_csv_file, 

338 input_zip_file=args.input_zip_file, 

339 output_csv_path=args.output_csv_path, 

340 output_jpg_path=args.output_jpg_path, 

341 properties=properties) 

342 

343 

344if __name__ == '__main__': 

345 main()