Coverage for biobb_dna / backbone / bipopulations.py: 90%
87 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-22 13:05 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-22 13:05 +0000
1#!/usr/bin/env python3
2"""Module containing the BIPopulations class and the command line interface."""
4from typing import Optional
6import matplotlib.pyplot as plt
7import pandas as pd
8from biobb_common.generic.biobb_object import BiobbObject
9from biobb_common.tools.file_utils import launchlogger
10from numpy import nan
12from biobb_dna.utils.common import _from_string_to_list
13from biobb_dna.utils.loader import read_series
14from biobb_dna.utils.transform import inverse_complement
17class BIPopulations(BiobbObject):
18 """
19 | biobb_dna BIPopulations
20 | Calculate BI/BII populations from epsilon and zeta parameters.
21 | Calculate BI/BII populations from epsilon and zeta parameters.
23 Args:
24 input_epsilC_path (str): Path to .ser file for helical parameter 'epsilC'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_epsilC.ser>`_. Accepted formats: ser (edam:format_2330).
25 input_epsilW_path (str): Path to .ser file for helical parameter 'epsilW'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_epsilW.ser>`_. Accepted formats: ser (edam:format_2330).
26 input_zetaC_path (str): Path to .ser file for helical parameter 'zetaC'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_zetaC.ser>`_. Accepted formats: ser (edam:format_2330).
27 input_zetaW_path (str): Path to .ser file for helical parameter 'zetaW'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_zetaW.ser>`_. Accepted formats: ser (edam:format_2330).
28 output_csv_path (str): Path to .csv file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/bipop_ref.csv>`_. Accepted formats: csv (edam:format_3752).
29 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/bipop_ref.jpg>`_. Accepted formats: jpg (edam:format_3579).
30 properties (dict):
31 * **sequence** (*str*) - (None) Nucleic acid sequence corresponding to the input .ser file. Length of sequence is expected to be the same as the total number of columns in the .ser file, minus the index column (even if later on a subset of columns is selected with the *seqpos* option).
32 * **seqpos** (*list*) - (None) list of sequence positions (columns indices starting by 0) to analyze. If not specified it will analyse the complete sequence.
33 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
34 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
35 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.
37 Examples:
38 This is a use example of how to use the building block from Python::
40 from biobb_dna.backbone.bipopulations import bipopulations
42 prop = {
43 'sequence': 'GCAT',
44 }
45 bipopulations(
46 input_epsilC_path='/path/to/epsilC.ser',
47 input_epsilW_path='/path/to/epsilW.ser',
48 input_zetaC_path='/path/to/zetaC.ser',
49 input_zetaW_path='/path/to/zetaW.ser',
50 output_csv_path='/path/to/table/output.csv',
51 output_jpg_path='/path/to/table/output.jpg',
52 properties=prop)
53 Info:
54 * wrapped_software:
55 * name: In house
56 * license: Apache-2.0
57 * ontology:
58 * name: EDAM
59 * schema: http://edamontology.org/EDAM.owl
61 """
63 def __init__(
64 self,
65 input_epsilC_path,
66 input_epsilW_path,
67 input_zetaC_path,
68 input_zetaW_path,
69 output_csv_path,
70 output_jpg_path,
71 properties=None,
72 **kwargs,
73 ) -> None:
74 properties = properties or {}
76 # Call parent class constructor
77 super().__init__(properties)
78 self.locals_var_dict = locals().copy()
80 # Input/Output files
81 self.io_dict = {
82 "in": {
83 "input_epsilC_path": input_epsilC_path,
84 "input_epsilW_path": input_epsilW_path,
85 "input_zetaC_path": input_zetaC_path,
86 "input_zetaW_path": input_zetaW_path,
87 },
88 "out": {
89 "output_csv_path": output_csv_path,
90 "output_jpg_path": output_jpg_path,
91 },
92 }
94 self.properties = properties
95 self.sequence = properties.get("sequence")
96 # self.seqpos = properties.get("seqpos", None)
97 self.seqpos = [
98 int(elem) for elem in _from_string_to_list(properties.get("seqpos", None))
99 ]
100 print("seqpos::::::::")
101 print(self.seqpos)
102 # Check the properties
103 self.check_properties(properties)
104 self.check_arguments()
106 @launchlogger
107 def launch(self) -> int:
108 """Execute the :class:`BIPopulations <backbone.bipopulations.BIPopulations>` object."""
110 # Setup Biobb
111 if self.check_restart():
112 return 0
113 self.stage_files()
115 # check sequence
116 if self.sequence is None or len(self.sequence) < 2:
117 raise ValueError("sequence is null or too short!")
119 # check seqpos
120 if self.seqpos:
121 if (max(self.seqpos) > len(self.sequence) - 2) or (min(self.seqpos) < 1):
122 raise ValueError(
123 f"seqpos values must be between 1 and {len(self.sequence) - 2}"
124 )
125 if not (isinstance(self.seqpos, list) and len(self.seqpos) > 1):
126 raise ValueError("seqpos must be a list of at least two integers")
127 else:
128 self.seqpos = None # type: ignore
130 # read input files
131 epsilC = read_series(
132 self.stage_io_dict["in"]["input_epsilC_path"], usecols=self.seqpos
133 )
134 epsilW = read_series(
135 self.stage_io_dict["in"]["input_epsilW_path"], usecols=self.seqpos
136 )
137 zetaC = read_series(
138 self.stage_io_dict["in"]["input_zetaC_path"], usecols=self.seqpos
139 )
140 zetaW = read_series(
141 self.stage_io_dict["in"]["input_zetaW_path"], usecols=self.seqpos
142 )
144 # calculate difference between epsil and zeta parameters
145 xlabels = self.get_xlabels(self.sequence, inverse_complement(self.sequence))
146 diff_epsil_zeta = self.get_angles_difference(epsilC, zetaC, epsilW, zetaW)
148 # calculate BI population
149 BI = (diff_epsil_zeta < 0).sum(axis=0) * 100 / len(diff_epsil_zeta)
150 BII = 100 - BI
152 # save table
153 Bpopulations_df = pd.DataFrame(
154 {"Nucleotide": xlabels, "BI population": BI, "BII population": BII}
155 )
156 Bpopulations_df.to_csv(
157 self.stage_io_dict["out"]["output_csv_path"], index=False
158 )
160 # save plot
161 fig, axs = plt.subplots(1, 1, dpi=300, tight_layout=True)
162 axs.bar(range(len(xlabels)), BI, label="BI")
163 axs.bar(range(len(xlabels)), BII, bottom=BI, label="BII")
164 # empty bar to divide both sequences
165 axs.bar([len(BI) // 2], [100], color="white", label=None)
166 axs.legend()
167 axs.set_xticks(range(len(xlabels)))
168 axs.set_xticklabels(xlabels, rotation=90)
169 axs.set_xlabel("Nucleotide Sequence")
170 axs.set_ylabel("BI/BII Population (%)")
171 axs.set_title("Nucleotide parameter: BI/BII Population")
172 fig.savefig(self.stage_io_dict["out"]["output_jpg_path"], format="jpg")
173 plt.close()
175 # Copy files to host
176 self.copy_to_host()
178 # Remove temporary file(s)
179 self.remove_tmp_files()
181 self.check_arguments(output_files_created=True, raise_exception=False)
183 return 0
185 def get_xlabels(self, strand1, strand2):
186 # get list of tetramers, except first and last two bases
187 labelsW = list(strand1)
188 labelsW[0] = f"{labelsW[0]}5'"
189 labelsW[-1] = f"{labelsW[-1]}3'"
190 labelsW = [f"{i}-{j}" for i, j in zip(labelsW, range(1, len(labelsW) + 1))]
191 labelsC = list(strand2)[::-1]
192 labelsC[0] = f"{labelsC[0]}5'"
193 labelsC[-1] = f"{labelsC[-1]}3'"
194 labelsC = [f"{i}-{j}" for i, j in zip(labelsC, range(len(labelsC), 0, -1))]
196 if self.seqpos is not None:
197 labelsC = [labelsC[i] for i in self.seqpos]
198 labelsW = [labelsW[i] for i in self.seqpos]
199 xlabels = labelsW + ["-"] + labelsC
200 return xlabels
202 def get_angles_difference(self, epsilC, zetaC, epsilW, zetaW):
203 # concatenate zeta and epsil arrays
204 separator_df = pd.DataFrame({"-": nan}, index=range(len(zetaW)))
205 zeta = pd.concat([zetaW, separator_df, zetaC[zetaC.columns[::-1]]], axis=1)
206 epsil = pd.concat([epsilW, separator_df, epsilC[epsilC.columns[::-1]]], axis=1)
208 # difference between epsilon and zeta coordinates
209 diff_epsil_zeta = epsil - zeta
210 return diff_epsil_zeta
213def bipopulations(
214 input_epsilC_path: str,
215 input_epsilW_path: str,
216 input_zetaC_path: str,
217 input_zetaW_path: str,
218 output_csv_path: str,
219 output_jpg_path: str,
220 properties: Optional[dict] = None,
221 **kwargs,
222) -> int:
223 """Create :class:`BIPopulations <dna.backbone.bipopulations.BIPopulations>` class and
224 execute the: meth: `launch() <dna.backbone.bipopulations.BIPopulations.launch>` method."""
225 return BIPopulations(**dict(locals())).launch()
228bipopulations.__doc__ = BIPopulations.__doc__
229main = BIPopulations.get_main(bipopulations, "Calculate BI/BII populations.")
231if __name__ == "__main__":
232 main()