Coverage for biobb_dna / dna / dna_bimodality.py: 74%
139 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-22 13:05 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-22 13:05 +0000
1#!/usr/bin/env python3
3"""Module containing the HelParBimodality class and the command line interface."""
4import os
5import zipfile
6from typing import Optional
7from pathlib import Path
9import pandas as pd
10import numpy as np
11import matplotlib.pyplot as plt
12from sklearn.mixture import GaussianMixture # type: ignore
13from biobb_dna.utils import constants
14from biobb_common.generic.biobb_object import BiobbObject
15from biobb_common.tools.file_utils import launchlogger
16from biobb_dna.utils.loader import load_data
19class HelParBimodality(BiobbObject):
20 """
21 | biobb_dna HelParBimodality
22 | Determine binormality/bimodality from a helical parameter series dataset.
23 | Determine binormality/bimodality from a helical parameter series dataset.
25 Args:
26 input_csv_file (str): Path to .csv file with helical parameter series. If `input_zip_file` is passed, this should be just the filename of the .csv file inside .zip. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/dna/series_shift_AT.csv>`_. Accepted formats: csv (edam:format_3752).
27 input_zip_file (str) (Optional): .zip file containing the `input_csv_file` .csv file. File type: input. Accepted formats: zip (edam:format_3987).
28 output_csv_path (str): Path to .csv file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/dna/AT_shift_bimod.csv>`_. Accepted formats: csv (edam:format_3752).
29 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/dna/AT_shift_bimod.jpg>`_. Accepted formats: jpg (edam:format_3579).
30 properties (dict):
31 * **helpar_name** (*str*) - (Optional) helical parameter name.
32 * **confidence_level** (*float*) - (5.0) Confidence level for Byes Factor test (in percentage).
33 * **max_iter** (*int*) - (400) Number of maximum iterations for EM algorithm.
34 * **tol** (*float*) - (1e-5) Tolerance value for EM algorithm.
35 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
36 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
37 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.1
39 Examples:
40 This is a use example of how to use the building block from Python::
42 from biobb_dna.dna.dna_bimodality import dna_bimodality
44 prop = {
45 'max_iter': 500,
46 }
47 dna_bimodality(
48 input_csv_file='filename.csv',
49 input_zip_file='/path/to/input.zip',
50 output_csv_path='/path/to/output.csv',
51 output_jpg_path='/path/to/output.jpg',
52 properties=prop)
53 Info:
54 * wrapped_software:
55 * name: In house
56 * license: Apache-2.0
57 * ontology:
58 * name: EDAM
59 * schema: http://edamontology.org/EDAM.owl
61 """
63 def __init__(self, input_csv_file, output_csv_path,
64 output_jpg_path, input_zip_file=None,
65 properties=None, **kwargs) -> None:
66 properties = properties or {}
68 # Call parent class constructor
69 super().__init__(properties)
70 self.locals_var_dict = locals().copy()
72 # Input/Output files
73 self.io_dict = {
74 'in': {
75 'input_csv_file': input_csv_file,
76 'input_zip_file': input_zip_file
77 },
78 'out': {
79 'output_csv_path': output_csv_path,
80 'output_jpg_path': output_jpg_path
81 }
82 }
84 # Properties specific for BB
85 self.confidence_level = properties.get(
86 "confidence_level", 5.0)
87 self.max_iter = properties.get(
88 "max_iter", 400)
89 self.tol = properties.get(
90 "tol", 1e-5)
91 self.helpar_name = properties.get("helpar_name", None)
92 self.properties = properties
94 # Check the properties
95 self.check_properties(properties)
96 self.check_arguments()
98 @launchlogger
99 def launch(self) -> int:
100 """Execute the :class:`HelParBimodality <dna.dna_bimodality.HelParBimodality>` object."""
102 # Setup Biobb
103 if self.check_restart():
104 return 0
105 self.stage_files()
107 # get helical parameter from filename if not specified
108 if self.helpar_name is None:
109 for hp in constants.helical_parameters:
110 if self.stage_io_dict.get("in", {}).get("input_zip_file") is not None:
111 condition_2 = (
112 hp.lower() in Path(self.stage_io_dict['in']['input_zip_file']).name.lower())
113 else:
114 condition_2 = False
115 condition_1 = hp.lower() in Path(
116 self.stage_io_dict['in']['input_csv_file']).name.lower()
117 if (condition_1 or condition_2):
118 self.helpar_name = hp
119 if self.helpar_name is None:
120 raise ValueError(
121 "Helical parameter name can't be inferred from file, "
122 "so it must be specified!")
123 else:
124 if self.helpar_name not in constants.helical_parameters:
125 raise ValueError(
126 "Helical parameter name is invalid! "
127 f"Options: {constants.helical_parameters}")
129 # get unit from helical parameter name
130 if self.helpar_name in constants.hp_angular:
131 self.hp_unit = "Degrees"
132 else:
133 self.hp_unit = "Angstroms"
135 # resolve output
136 output_csv_path = Path(
137 self.stage_io_dict['out']['output_csv_path']).resolve()
138 output_jpg_path = Path(
139 self.stage_io_dict['out']['output_jpg_path']).resolve()
141 # change directory to temporary folder
142 original_directory = os.getcwd()
143 os.chdir(self.stage_io_dict.get("unique_dir", ""))
145 # read input
146 if self.stage_io_dict.get("in", {}).get("input_zip_file") is not None:
147 # if zipfile is specified, extract to temporary folder
148 with zipfile.ZipFile(
149 self.stage_io_dict['in']['input_zip_file'],
150 'r') as zip_ref:
151 zip_ref.extractall(self.stage_io_dict.get("unique_dir", ""))
153 data = load_data(Path(self.stage_io_dict['in']['input_csv_file']).name)
155 means, variances, bics, weights = self.fit_to_model(data)
156 uninormal, binormal, insuf_ev = self.bayes_factor_criteria(
157 bics[0], bics[1])
159 if binormal:
160 maxm = np.argmax(means[1])
161 minm = np.argmin(means[1])
162 mean1 = means[1][minm]
163 var1 = variances[1][minm]
164 w1 = weights[1][minm]
165 mean2 = means[1][maxm]
166 var2 = variances[1][maxm]
167 w2 = weights[1][maxm]
168 bimodal = self.helguero_theorem(mean1, mean2, var1, var2)
169 else:
170 mean1 = means[0][0]
171 var1 = variances[0][0]
172 w1 = weights[0][0]
173 mean2, var2, w2 = np.nan, np.nan, 0
174 bimodal = False
175 info = dict(
176 binormal=binormal,
177 uninormal=uninormal,
178 insuf_ev=insuf_ev,
179 bimodal=bimodal,
180 mean1=mean1,
181 mean2=mean2,
182 var1=var1,
183 var2=var2,
184 w1=w1,
185 w2=w2)
187 # save tables
188 pd.DataFrame(info, index=data.columns).to_csv(output_csv_path)
190 # make and save plot
191 data_size = len(data)
192 synth1 = np.random.normal(
193 loc=info['mean1'],
194 scale=np.sqrt(info['var1']),
195 size=int(data_size * info['w1']))
196 synth2 = np.random.normal(
197 loc=info['mean2'],
198 scale=np.sqrt(info['var2']),
199 size=int(data_size * info['w2']))
201 plt.figure()
202 alpha = 0.7
203 bins = 100
204 if binormal:
205 label1 = "Low State"
206 else:
207 label1 = "Single State"
208 out = plt.hist(
209 synth1, bins=bins, alpha=alpha, density=True, label=label1)
210 ylim = max(out[0]) # type: ignore
211 plt.vlines(info['mean1'], 0, ylim, colors="r", linestyles="dashed")
212 if binormal:
213 out = plt.hist(
214 synth2, bins=bins, alpha=alpha, density=True, label="high state")
215 ylim = max(out[0]) # type: ignore
216 plt.vlines(info['mean2'], 0, ylim, colors="r", linestyles="dashed")
217 plt.legend()
218 plt.ylabel("Density")
219 plt.xlabel(f"{self.helpar_name.capitalize()} ({self.hp_unit})")
220 plt.title(f"Distribution of {self.helpar_name} states")
221 plt.savefig(output_jpg_path, format="jpg")
222 plt.close()
224 # change back to original directory
225 os.chdir(original_directory)
227 # Copy files to host
228 self.copy_to_host()
230 # Remove temporary file(s)
231 self.remove_tmp_files()
233 self.check_arguments(output_files_created=True, raise_exception=False)
235 return 0
237 def fit_to_model(self, data):
238 """
239 Fit data to Gaussian Mixture models.
240 Return dictionary with distribution data.
241 """
242 means = []
243 variances = []
244 bics = []
245 weights = []
246 for n_components in (1, 2):
247 gmm = GaussianMixture(
248 n_components=n_components,
249 max_iter=self.max_iter,
250 tol=self.tol)
251 gmm = gmm.fit(data)
252 m = gmm.means_.flatten() # type: ignore
253 v = gmm.covariances_.flatten() # type: ignore
254 b = gmm.bic(data)
255 w = gmm.weights_.flatten() # type: ignore
256 means.append(m)
257 variances.append(v)
258 bics.append(b)
259 weights.append(w)
260 return means, variances, bics, weights
262 def bayes_factor_criteria(self, bic1, bic2):
263 diff_bic = bic2 - bic1
264 # probability of a two-component model
265 p = 1 / (1 + np.exp(0.5*diff_bic))
266 if p == np.nan:
267 if bic1 == np.nan:
268 p = 1
269 elif bic2 == np.nan:
270 p = 0
272 uninormal = p < (self.confidence_level / 100)
273 binormal = p > (1 - (self.confidence_level / 100))
274 insuf_ev = True if (not uninormal and not binormal) else False
275 return uninormal, binormal, insuf_ev
277 def helguero_theorem(self, mean1, mean2, var1, var2):
278 r = var1 / var2
279 separation_factor = np.sqrt(
280 -2 + 3*r + 3*r**2 - 2*r**3 + 2*(1 - r + r**2)**1.5
281 ) / (
282 np.sqrt(r)*(1+np.sqrt(r))
283 )
284 bimodal = abs(mean2-mean1) > separation_factor * \
285 (np.sqrt(var1) + np.sqrt(var2))
286 return bimodal
289def dna_bimodality(
290 input_csv_file, output_csv_path, output_jpg_path,
291 input_zip_file: Optional[str] = None, properties: Optional[dict] = None, **kwargs) -> int:
292 """Create :class:`HelParBimodality <dna.dna_bimodality.HelParBimodality>` class and
293 execute the :meth:`launch() <dna.dna_bimodality.HelParBimodality.launch>` method."""
294 return HelParBimodality(**dict(locals())).launch()
297dna_bimodality.__doc__ = HelParBimodality.__doc__
298main = HelParBimodality.get_main(dna_bimodality, "Determine binormality/bimodality from a helical parameter dataset.")
300if __name__ == '__main__':
301 main()