Coverage for biobb_dna/dna/dna_bimodality.py: 68%
152 statements
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-28 10:36 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-28 10:36 +0000
1#!/usr/bin/env python3
3"""Module containing the HelParBimodality class and the command line interface."""
4import os
5import zipfile
6import argparse
7from typing import Optional
8from pathlib import Path
10import pandas as pd
11import numpy as np
12import matplotlib.pyplot as plt
13from sklearn.mixture import GaussianMixture # type: ignore
14from biobb_dna.utils import constants
15from biobb_common.generic.biobb_object import BiobbObject
16from biobb_common.configuration import settings
17from biobb_common.tools.file_utils import launchlogger
18from biobb_dna.utils.loader import load_data
21class HelParBimodality(BiobbObject):
22 """
23 | biobb_dna HelParBimodality
24 | Determine binormality/bimodality from a helical parameter series dataset.
25 | Determine binormality/bimodality from a helical parameter series dataset.
27 Args:
28 input_csv_file (str): Path to .csv file with helical parameter series. If `input_zip_file` is passed, this should be just the filename of the .csv file inside .zip. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/dna/series_shift_AT.csv>`_. Accepted formats: csv (edam:format_3752).
29 input_zip_file (str) (Optional): .zip file containing the `input_csv_file` .csv file. File type: input. Accepted formats: zip (edam:format_3987).
30 output_csv_path (str): Path to .csv file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/dna/AT_shift_bimod.csv>`_. Accepted formats: csv (edam:format_3752).
31 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/dna/AT_shift_bimod.jpg>`_. Accepted formats: jpg (edam:format_3579).
32 properties (dict):
33 * **helpar_name** (*str*) - (Optional) helical parameter name.
34 * **confidence_level** (*float*) - (5.0) Confidence level for Byes Factor test (in percentage).
35 * **max_iter** (*int*) - (400) Number of maximum iterations for EM algorithm.
36 * **tol** (*float*) - (1e-5) Tolerance value for EM algorithm.
37 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
38 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
39 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.1
41 Examples:
42 This is a use example of how to use the building block from Python::
44 from biobb_dna.dna.dna_bimodality import dna_bimodality
46 prop = {
47 'max_iter': 500,
48 }
49 dna_bimodality(
50 input_csv_file='filename.csv',
51 input_zip_file='/path/to/input.zip',
52 output_csv_path='/path/to/output.csv',
53 output_jpg_path='/path/to/output.jpg',
54 properties=prop)
55 Info:
56 * wrapped_software:
57 * name: In house
58 * license: Apache-2.0
59 * ontology:
60 * name: EDAM
61 * schema: http://edamontology.org/EDAM.owl
63 """
65 def __init__(self, input_csv_file, output_csv_path,
66 output_jpg_path, input_zip_file=None,
67 properties=None, **kwargs) -> None:
68 properties = properties or {}
70 # Call parent class constructor
71 super().__init__(properties)
72 self.locals_var_dict = locals().copy()
74 # Input/Output files
75 self.io_dict = {
76 'in': {
77 'input_csv_file': input_csv_file,
78 'input_zip_file': input_zip_file
79 },
80 'out': {
81 'output_csv_path': output_csv_path,
82 'output_jpg_path': output_jpg_path
83 }
84 }
86 # Properties specific for BB
87 self.confidence_level = properties.get(
88 "confidence_level", 5.0)
89 self.max_iter = properties.get(
90 "max_iter", 400)
91 self.tol = properties.get(
92 "tol", 1e-5)
93 self.helpar_name = properties.get("helpar_name", None)
94 self.properties = properties
96 # Check the properties
97 self.check_properties(properties)
98 self.check_arguments()
100 @launchlogger
101 def launch(self) -> int:
102 """Execute the :class:`HelParBimodality <dna.dna_bimodality.HelParBimodality>` object."""
104 # Setup Biobb
105 if self.check_restart():
106 return 0
107 self.stage_files()
109 # get helical parameter from filename if not specified
110 if self.helpar_name is None:
111 for hp in constants.helical_parameters:
112 if self.stage_io_dict.get("in", {}).get("input_zip_file") is not None:
113 condition_2 = (
114 hp.lower() in Path(self.stage_io_dict['in']['input_zip_file']).name.lower())
115 else:
116 condition_2 = False
117 condition_1 = hp.lower() in Path(
118 self.stage_io_dict['in']['input_csv_file']).name.lower()
119 if (condition_1 or condition_2):
120 self.helpar_name = hp
121 if self.helpar_name is None:
122 raise ValueError(
123 "Helical parameter name can't be inferred from file, "
124 "so it must be specified!")
125 else:
126 if self.helpar_name not in constants.helical_parameters:
127 raise ValueError(
128 "Helical parameter name is invalid! "
129 f"Options: {constants.helical_parameters}")
131 # get unit from helical parameter name
132 if self.helpar_name in constants.hp_angular:
133 self.hp_unit = "Degrees"
134 else:
135 self.hp_unit = "Angstroms"
137 # resolve output
138 output_csv_path = Path(
139 self.stage_io_dict['out']['output_csv_path']).resolve()
140 output_jpg_path = Path(
141 self.stage_io_dict['out']['output_jpg_path']).resolve()
143 # change directory to temporary folder
144 original_directory = os.getcwd()
145 os.chdir(self.stage_io_dict.get("unique_dir", ""))
147 # read input
148 if self.stage_io_dict.get("in", {}).get("input_zip_file") is not None:
149 # if zipfile is specified, extract to temporary folder
150 with zipfile.ZipFile(
151 self.stage_io_dict['in']['input_zip_file'],
152 'r') as zip_ref:
153 zip_ref.extractall(self.stage_io_dict.get("unique_dir", ""))
155 data = load_data(Path(self.stage_io_dict['in']['input_csv_file']).name)
157 means, variances, bics, weights = self.fit_to_model(data)
158 uninormal, binormal, insuf_ev = self.bayes_factor_criteria(
159 bics[0], bics[1])
161 if binormal:
162 maxm = np.argmax(means[1])
163 minm = np.argmin(means[1])
164 mean1 = means[1][minm]
165 var1 = variances[1][minm]
166 w1 = weights[1][minm]
167 mean2 = means[1][maxm]
168 var2 = variances[1][maxm]
169 w2 = weights[1][maxm]
170 bimodal = self.helguero_theorem(mean1, mean2, var1, var2)
171 else:
172 mean1 = means[0][0]
173 var1 = variances[0][0]
174 w1 = weights[0][0]
175 mean2, var2, w2 = np.nan, np.nan, 0
176 bimodal = False
177 info = dict(
178 binormal=binormal,
179 uninormal=uninormal,
180 insuf_ev=insuf_ev,
181 bimodal=bimodal,
182 mean1=mean1,
183 mean2=mean2,
184 var1=var1,
185 var2=var2,
186 w1=w1,
187 w2=w2)
189 # save tables
190 pd.DataFrame(info, index=data.columns).to_csv(output_csv_path)
192 # make and save plot
193 data_size = len(data)
194 synth1 = np.random.normal(
195 loc=info['mean1'],
196 scale=np.sqrt(info['var1']),
197 size=int(data_size * info['w1']))
198 synth2 = np.random.normal(
199 loc=info['mean2'],
200 scale=np.sqrt(info['var2']),
201 size=int(data_size * info['w2']))
203 plt.figure()
204 alpha = 0.7
205 bins = 100
206 if binormal:
207 label1 = "Low State"
208 else:
209 label1 = "Single State"
210 out = plt.hist(
211 synth1, bins=bins, alpha=alpha, density=True, label=label1)
212 ylim = max(out[0]) # type: ignore
213 plt.vlines(info['mean1'], 0, ylim, colors="r", linestyles="dashed")
214 if binormal:
215 out = plt.hist(
216 synth2, bins=bins, alpha=alpha, density=True, label="high state")
217 ylim = max(out[0]) # type: ignore
218 plt.vlines(info['mean2'], 0, ylim, colors="r", linestyles="dashed")
219 plt.legend()
220 plt.ylabel("Density")
221 plt.xlabel(f"{self.helpar_name.capitalize()} ({self.hp_unit})")
222 plt.title(f"Distribution of {self.helpar_name} states")
223 plt.savefig(output_jpg_path, format="jpg")
224 plt.close()
226 # change back to original directory
227 os.chdir(original_directory)
229 # Copy files to host
230 self.copy_to_host()
232 # Remove temporary file(s)
233 # self.tmp_files.extend([
234 # self.stage_io_dict.get("unique_dir", "")
235 # ])
236 self.remove_tmp_files()
238 self.check_arguments(output_files_created=True, raise_exception=False)
240 return 0
242 def fit_to_model(self, data):
243 """
244 Fit data to Gaussian Mixture models.
245 Return dictionary with distribution data.
246 """
247 means = []
248 variances = []
249 bics = []
250 weights = []
251 for n_components in (1, 2):
252 gmm = GaussianMixture(
253 n_components=n_components,
254 max_iter=self.max_iter,
255 tol=self.tol)
256 gmm = gmm.fit(data)
257 m = gmm.means_.flatten() # type: ignore
258 v = gmm.covariances_.flatten() # type: ignore
259 b = gmm.bic(data)
260 w = gmm.weights_.flatten() # type: ignore
261 means.append(m)
262 variances.append(v)
263 bics.append(b)
264 weights.append(w)
265 return means, variances, bics, weights
267 def bayes_factor_criteria(self, bic1, bic2):
268 diff_bic = bic2 - bic1
269 # probability of a two-component model
270 p = 1 / (1 + np.exp(0.5*diff_bic))
271 if p == np.nan:
272 if bic1 == np.nan:
273 p = 1
274 elif bic2 == np.nan:
275 p = 0
277 uninormal = p < (self.confidence_level / 100)
278 binormal = p > (1 - (self.confidence_level / 100))
279 insuf_ev = True if (not uninormal and not binormal) else False
280 return uninormal, binormal, insuf_ev
282 def helguero_theorem(self, mean1, mean2, var1, var2):
283 r = var1 / var2
284 separation_factor = np.sqrt(
285 -2 + 3*r + 3*r**2 - 2*r**3 + 2*(1 - r + r**2)**1.5
286 ) / (
287 np.sqrt(r)*(1+np.sqrt(r))
288 )
289 bimodal = abs(mean2-mean1) > separation_factor * \
290 (np.sqrt(var1) + np.sqrt(var2))
291 return bimodal
294def dna_bimodality(
295 input_csv_file, output_csv_path, output_jpg_path,
296 input_zip_file: Optional[str] = None, properties: Optional[dict] = None, **kwargs) -> int:
297 """Create :class:`HelParBimodality <dna.dna_bimodality.HelParBimodality>` class and
298 execute the :meth:`launch() <dna.dna_bimodality.HelParBimodality.launch>` method."""
300 return HelParBimodality(
301 input_csv_file=input_csv_file,
302 input_zip_file=input_zip_file,
303 output_csv_path=output_csv_path,
304 output_jpg_path=output_jpg_path,
305 properties=properties, **kwargs).launch()
307 dna_bimodality.__doc__ = HelParBimodality.__doc__
310def main():
311 """Command line execution of this building block. Please check the command line documentation."""
312 parser = argparse.ArgumentParser(description='Determine binormality/bimodality from a helical parameter dataset.',
313 formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999))
314 parser.add_argument('--config', required=False, help='Configuration file')
316 required_args = parser.add_argument_group('required arguments')
317 required_args.add_argument('--input_csv_file', required=True,
318 help='Path to csv file with data. Accepted formats: csv.')
319 parser.add_argument('--input_zip_file',
320 help='Path to zip file containing csv input files. Accepted formats: zip.')
321 required_args.add_argument('--output_csv_path', required=True,
322 help='Filename and/or path of output csv file.')
323 required_args.add_argument('--output_jpg_path', required=True,
324 help='Filename and/or path of output jpg file.')
326 args = parser.parse_args()
327 args.config = args.config or "{}"
328 properties = settings.ConfReader(config=args.config).get_prop_dic()
330 dna_bimodality(
331 input_csv_file=args.input_csv_file,
332 input_zip_file=args.input_zip_file,
333 output_csv_path=args.output_csv_path,
334 output_jpg_path=args.output_jpg_path,
335 properties=properties)
338if __name__ == '__main__':
339 main()