Coverage for biobb_dna/dna/dna_bimodality.py: 69%
158 statements
« prev ^ index » next coverage.py v7.5.1, created at 2024-05-07 09:06 +0000
« prev ^ index » next coverage.py v7.5.1, created at 2024-05-07 09:06 +0000
1#!/usr/bin/env python3
3"""Module containing the HelParBimodality class and the command line interface."""
4import os
5import shutil
6import zipfile
7import argparse
8from pathlib import Path
10import pandas as pd
11import numpy as np
12import matplotlib.pyplot as plt
13from sklearn.mixture import GaussianMixture
15from biobb_dna.utils import constants
16from biobb_common.generic.biobb_object import BiobbObject
17from biobb_common.configuration import settings
18from biobb_common.tools import file_utils as fu
19from biobb_common.tools.file_utils import launchlogger
20from biobb_dna.utils.loader import load_data
23class HelParBimodality(BiobbObject):
24 """
25 | biobb_dna HelParBimodality
26 | Determine binormality/bimodality from a helical parameter series dataset.
28 Args:
29 input_csv_file (str): Path to .csv file with helical parameter series. If `input_zip_file` is passed, this should be just the filename of the .csv file inside .zip. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/dna/series_shift_AT.csv>`_. Accepted formats: csv (edam:format_3752).
30 input_zip_file (str) (Optional): .zip file containing the `input_csv_file` .csv file. File type: input. Accepted formats: zip (edam:format_3987).
31 output_csv_path (str): Path to .csv file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/dna/AT_shift_bimod.csv>`_. Accepted formats: csv (edam:format_3752).
32 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/dna/AT_shift_bimod.jpg>`_. Accepted formats: jpg (edam:format_3579).
33 properties (dict):
34 * **helpar_name** (*str*) - (Optional) helical parameter name.
35 * **confidence_level** (*float*) - (5.0) Confidence level for Byes Factor test (in percentage).
36 * **max_iter** (*int*) - (400) Number of maximum iterations for EM algorithm.
37 * **tol** (*float*) - (1e-5) Tolerance value for EM algorithm.
38 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
39 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.1
41 Examples:
42 This is a use example of how to use the building block from Python::
44 from biobb_dna.dna.dna_bimodality import dna_bimodality
46 prop = {
47 'max_iter': 500,
48 }
49 dna_bimodality(
50 input_csv_file='filename.csv',
51 input_zip_file='/path/to/input.zip',
52 output_csv_path='/path/to/output.csv',
53 output_jpg_path='/path/to/output.jpg',
54 properties=prop)
55 Info:
56 * wrapped_software:
57 * name: In house
58 * license: Apache-2.0
59 * ontology:
60 * name: EDAM
61 * schema: http://edamontology.org/EDAM.owl
63 """
65 def __init__(self, input_csv_file, output_csv_path,
66 output_jpg_path, input_zip_file=None,
67 properties=None, **kwargs) -> None:
68 properties = properties or {}
70 # Call parent class constructor
71 super().__init__(properties)
72 self.locals_var_dict = locals().copy()
74 # Input/Output files
75 self.io_dict = {
76 'in': {
77 'input_csv_file': input_csv_file,
78 'input_zip_file': input_zip_file
79 },
80 'out': {
81 'output_csv_path': output_csv_path,
82 'output_jpg_path': output_jpg_path
83 }
84 }
86 # Properties specific for BB
87 self.confidence_level = properties.get(
88 "confidence_level", 5.0)
89 self.max_iter = properties.get(
90 "max_iter", 400)
91 self.tol = properties.get(
92 "tol", 1e-5)
93 self.helpar_name = properties.get("helpar_name", None)
94 self.properties = properties
96 # Check the properties
97 self.check_properties(properties)
98 self.check_arguments()
100 @launchlogger
101 def launch(self) -> int:
102 """Execute the :class:`HelParBimodality <dna.dna_bimodality.HelParBimodality>` object."""
104 # Setup Biobb
105 if self.check_restart():
106 return 0
107 self.stage_files()
109 # get helical parameter from filename if not specified
110 if self.helpar_name is None:
111 for hp in constants.helical_parameters:
112 input_zip_path = self.io_dict['in']['input_zip_file']
113 if input_zip_path is not None:
114 condition_2 = (
115 hp.lower() in Path(input_zip_path).name.lower())
116 else:
117 condition_2 = False
118 condition_1 = hp.lower() in Path(
119 self.io_dict['in']['input_csv_file']).name.lower()
120 if (condition_1 or condition_2):
121 self.helpar_name = hp
122 if self.helpar_name is None:
123 raise ValueError(
124 "Helical parameter name can't be inferred from file, "
125 "so it must be specified!")
126 else:
127 if self.helpar_name not in constants.helical_parameters:
128 raise ValueError(
129 "Helical parameter name is invalid! "
130 f"Options: {constants.helical_parameters}")
132 # get unit from helical parameter name
133 if self.helpar_name in constants.hp_angular:
134 self.hp_unit = "Degrees"
135 else:
136 self.hp_unit = "Angstroms"
138 # resolve output
139 output_csv_path = Path(
140 self.io_dict['out']['output_csv_path']).resolve()
141 output_jpg_path = Path(
142 self.io_dict['out']['output_jpg_path']).resolve()
144 # Creating temporary folder
145 self.tmp_folder = fu.create_unique_dir(prefix="bimodality_")
146 fu.log('Creating %s temporary folder' % self.tmp_folder, self.out_log)
148 # read input
149 if self.io_dict['in']['input_zip_file'] is not None:
150 # Copy input_file_path1 to temporary folder
151 shutil.copy(self.io_dict['in']['input_zip_file'], self.tmp_folder)
152 # if zipfile is specified, extract to temporary folder
153 with zipfile.ZipFile(
154 self.io_dict['in']['input_zip_file'],
155 'r') as zip_ref:
156 zip_ref.extractall(self.tmp_folder)
157 else:
158 # copy input files to temporary folder
159 shutil.copy(
160 self.io_dict['in']['input_csv_file'],
161 self.tmp_folder)
162 # change directory to temporary folder
163 original_directory = os.getcwd()
164 os.chdir(self.tmp_folder)
165 data = load_data(Path(self.io_dict['in']['input_csv_file']).name)
167 means, variances, bics, weights = self.fit_to_model(data)
168 uninormal, binormal, insuf_ev = self.bayes_factor_criteria(
169 bics[0], bics[1])
171 if binormal:
172 maxm = np.argmax(means[1])
173 minm = np.argmin(means[1])
174 mean1 = means[1][minm]
175 var1 = variances[1][minm]
176 w1 = weights[1][minm]
177 mean2 = means[1][maxm]
178 var2 = variances[1][maxm]
179 w2 = weights[1][maxm]
180 bimodal = self.helguero_theorem(mean1, mean2, var1, var2)
181 else:
182 mean1 = means[0][0]
183 var1 = variances[0][0]
184 w1 = weights[0][0]
185 mean2, var2, w2 = np.nan, np.nan, 0
186 bimodal = False
187 info = dict(
188 binormal=binormal,
189 uninormal=uninormal,
190 insuf_ev=insuf_ev,
191 bimodal=bimodal,
192 mean1=mean1,
193 mean2=mean2,
194 var1=var1,
195 var2=var2,
196 w1=w1,
197 w2=w2)
199 # save tables
200 pd.DataFrame(info, index=data.columns).to_csv(output_csv_path)
202 # make and save plot
203 data_size = len(data)
204 synth1 = np.random.normal(
205 loc=info['mean1'],
206 scale=np.sqrt(info['var1']),
207 size=int(data_size * info['w1']))
208 synth2 = np.random.normal(
209 loc=info['mean2'],
210 scale=np.sqrt(info['var2']),
211 size=int(data_size * info['w2']))
213 plt.figure()
214 alpha = 0.7
215 bins = 100
216 if binormal:
217 label1 = "Low State"
218 else:
219 label1 = "Single State"
220 out = plt.hist(
221 synth1, bins=bins, alpha=alpha, density=True, label=label1)
222 ylim = max(out[0])
223 plt.vlines(info['mean1'], 0, ylim, colors="r", linestyles="dashed")
224 if binormal:
225 out = plt.hist(
226 synth2, bins=bins, alpha=alpha, density=True, label="high state")
227 ylim = max(out[0])
228 plt.vlines(info['mean2'], 0, ylim, colors="r", linestyles="dashed")
229 plt.legend()
230 plt.ylabel("Density")
231 plt.xlabel(f"{self.helpar_name.capitalize()} ({self.hp_unit})")
232 plt.title(f"Distribution of {self.helpar_name} states")
233 plt.savefig(output_jpg_path, format="jpg")
234 plt.close()
236 # change back to original directory
237 os.chdir(original_directory)
239 # Remove temporary file(s)
240 self.tmp_files.extend([
241 self.stage_io_dict.get("unique_dir"),
242 self.tmp_folder
243 ])
244 self.remove_tmp_files()
246 self.check_arguments(output_files_created=True, raise_exception=False)
248 return 0
250 def fit_to_model(self, data):
251 """
252 Fit data to Gaussian Mixture models.
253 Return dictionary with distribution data.
254 """
255 means = []
256 variances = []
257 bics = []
258 weights = []
259 for n_components in (1, 2):
260 gmm = GaussianMixture(
261 n_components=n_components,
262 max_iter=self.max_iter,
263 tol=self.tol)
264 gmm = gmm.fit(data)
265 m = gmm.means_.flatten()
266 v = gmm.covariances_.flatten()
267 b = gmm.bic(data)
268 w = gmm.weights_.flatten()
269 means.append(m)
270 variances.append(v)
271 bics.append(b)
272 weights.append(w)
273 return means, variances, bics, weights
275 def bayes_factor_criteria(self, bic1, bic2):
276 diff_bic = bic2 - bic1
277 # probability of a two-component model
278 p = 1 / (1 + np.exp(0.5*diff_bic))
279 if p == np.nan:
280 if bic1 == np.nan:
281 p = 1
282 elif bic2 == np.nan:
283 p = 0
285 uninormal = p < (self.confidence_level / 100)
286 binormal = p > (1 - (self.confidence_level / 100))
287 insuf_ev = True if (not uninormal and not binormal) else False
288 return uninormal, binormal, insuf_ev
290 def helguero_theorem(self, mean1, mean2, var1, var2):
291 r = var1 / var2
292 separation_factor = np.sqrt(
293 -2 + 3*r + 3*r**2 - 2*r**3 + 2*(1 - r + r**2)**1.5
294 ) / (
295 np.sqrt(r)*(1+np.sqrt(r))
296 )
297 bimodal = abs(mean2-mean1) > separation_factor * \
298 (np.sqrt(var1) + np.sqrt(var2))
299 return bimodal
302def dna_bimodality(
303 input_csv_file, output_csv_path, output_jpg_path,
304 input_zip_file: str = None, properties: dict = None, **kwargs) -> int:
305 """Create :class:`HelParBimodality <dna.dna_bimodality.HelParBimodality>` class and
306 execute the :meth:`launch() <dna.dna_bimodality.HelParBimodality.launch>` method."""
308 return HelParBimodality(
309 input_csv_file=input_csv_file,
310 input_zip_file=input_zip_file,
311 output_csv_path=output_csv_path,
312 output_jpg_path=output_jpg_path,
313 properties=properties, **kwargs).launch()
316def main():
317 """Command line execution of this building block. Please check the command line documentation."""
318 parser = argparse.ArgumentParser(description='Determine binormality/bimodality from a helical parameter dataset.',
319 formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999))
320 parser.add_argument('--config', required=False, help='Configuration file')
322 required_args = parser.add_argument_group('required arguments')
323 required_args.add_argument('--input_csv_file', required=True,
324 help='Path to csv file with data. Accepted formats: csv.')
325 parser.add_argument('--input_zip_file',
326 help='Path to zip file containing csv input files. Accepted formats: zip.')
327 required_args.add_argument('--output_csv_path', required=True,
328 help='Filename and/or path of output csv file.')
329 required_args.add_argument('--output_jpg_path', required=True,
330 help='Filename and/or path of output jpg file.')
332 args = parser.parse_args()
333 args.config = args.config or "{}"
334 properties = settings.ConfReader(config=args.config).get_prop_dic()
336 dna_bimodality(
337 input_csv_file=args.input_csv_file,
338 input_zip_file=args.input_zip_file,
339 output_csv_path=args.output_csv_path,
340 output_jpg_path=args.output_jpg_path,
341 properties=properties)
344if __name__ == '__main__':
345 main()