Coverage for biobb_dna/backbone/bipopulations.py: 77%
102 statements
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-28 10:36 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-28 10:36 +0000
1#!/usr/bin/env python3
2"""Module containing the BIPopulations class and the command line interface."""
4import argparse
5from typing import Optional
7import matplotlib.pyplot as plt
8import pandas as pd
9from biobb_common.configuration import settings
10from biobb_common.generic.biobb_object import BiobbObject
11from biobb_common.tools.file_utils import launchlogger
12from numpy import nan
14from biobb_dna.utils.common import _from_string_to_list
15from biobb_dna.utils.loader import read_series
16from biobb_dna.utils.transform import inverse_complement
19class BIPopulations(BiobbObject):
20 """
21 | biobb_dna BIPopulations
22 | Calculate BI/BII populations from epsilon and zeta parameters.
23 | Calculate BI/BII populations from epsilon and zeta parameters.
25 Args:
26 input_epsilC_path (str): Path to .ser file for helical parameter 'epsilC'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_epsilC.ser>`_. Accepted formats: ser (edam:format_2330).
27 input_epsilW_path (str): Path to .ser file for helical parameter 'epsilW'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_epsilW.ser>`_. Accepted formats: ser (edam:format_2330).
28 input_zetaC_path (str): Path to .ser file for helical parameter 'zetaC'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_zetaC.ser>`_. Accepted formats: ser (edam:format_2330).
29 input_zetaW_path (str): Path to .ser file for helical parameter 'zetaW'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_zetaW.ser>`_. Accepted formats: ser (edam:format_2330).
30 output_csv_path (str): Path to .csv file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/bipop_ref.csv>`_. Accepted formats: csv (edam:format_3752).
31 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/bipop_ref.jpg>`_. Accepted formats: jpg (edam:format_3579).
32 properties (dict):
33 * **sequence** (*str*) - (None) Nucleic acid sequence corresponding to the input .ser file. Length of sequence is expected to be the same as the total number of columns in the .ser file, minus the index column (even if later on a subset of columns is selected with the *seqpos* option).
34 * **seqpos** (*list*) - (None) list of sequence positions (columns indices starting by 0) to analyze. If not specified it will analyse the complete sequence.
35 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
36 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
37 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.
39 Examples:
40 This is a use example of how to use the building block from Python::
42 from biobb_dna.backbone.bipopulations import bipopulations
44 prop = {
45 'sequence': 'GCAT',
46 }
47 bipopulations(
48 input_epsilC_path='/path/to/epsilC.ser',
49 input_epsilW_path='/path/to/epsilW.ser',
50 input_zetaC_path='/path/to/zetaC.ser',
51 input_zetaW_path='/path/to/zetaW.ser',
52 output_csv_path='/path/to/table/output.csv',
53 output_jpg_path='/path/to/table/output.jpg',
54 properties=prop)
55 Info:
56 * wrapped_software:
57 * name: In house
58 * license: Apache-2.0
59 * ontology:
60 * name: EDAM
61 * schema: http://edamontology.org/EDAM.owl
63 """
65 def __init__(
66 self,
67 input_epsilC_path,
68 input_epsilW_path,
69 input_zetaC_path,
70 input_zetaW_path,
71 output_csv_path,
72 output_jpg_path,
73 properties=None,
74 **kwargs,
75 ) -> None:
76 properties = properties or {}
78 # Call parent class constructor
79 super().__init__(properties)
80 self.locals_var_dict = locals().copy()
82 # Input/Output files
83 self.io_dict = {
84 "in": {
85 "input_epsilC_path": input_epsilC_path,
86 "input_epsilW_path": input_epsilW_path,
87 "input_zetaC_path": input_zetaC_path,
88 "input_zetaW_path": input_zetaW_path,
89 },
90 "out": {
91 "output_csv_path": output_csv_path,
92 "output_jpg_path": output_jpg_path,
93 },
94 }
96 self.properties = properties
97 self.sequence = properties.get("sequence")
98 # self.seqpos = properties.get("seqpos", None)
99 self.seqpos = [
100 int(elem) for elem in _from_string_to_list(properties.get("seqpos", None))
101 ]
102 print("seqpos::::::::")
103 print(self.seqpos)
104 # Check the properties
105 self.check_properties(properties)
106 self.check_arguments()
108 @launchlogger
109 def launch(self) -> int:
110 """Execute the :class:`BIPopulations <backbone.bipopulations.BIPopulations>` object."""
112 # Setup Biobb
113 if self.check_restart():
114 return 0
115 self.stage_files()
117 # check sequence
118 if self.sequence is None or len(self.sequence) < 2:
119 raise ValueError("sequence is null or too short!")
121 # check seqpos
122 if self.seqpos:
123 if (max(self.seqpos) > len(self.sequence) - 2) or (min(self.seqpos) < 1):
124 raise ValueError(
125 f"seqpos values must be between 1 and {len(self.sequence) - 2}"
126 )
127 if not (isinstance(self.seqpos, list) and len(self.seqpos) > 1):
128 raise ValueError("seqpos must be a list of at least two integers")
129 else:
130 self.seqpos = None # type: ignore
132 # read input files
133 epsilC = read_series(
134 self.stage_io_dict["in"]["input_epsilC_path"], usecols=self.seqpos
135 )
136 epsilW = read_series(
137 self.stage_io_dict["in"]["input_epsilW_path"], usecols=self.seqpos
138 )
139 zetaC = read_series(
140 self.stage_io_dict["in"]["input_zetaC_path"], usecols=self.seqpos
141 )
142 zetaW = read_series(
143 self.stage_io_dict["in"]["input_zetaW_path"], usecols=self.seqpos
144 )
146 # calculate difference between epsil and zeta parameters
147 xlabels = self.get_xlabels(self.sequence, inverse_complement(self.sequence))
148 diff_epsil_zeta = self.get_angles_difference(epsilC, zetaC, epsilW, zetaW)
150 # calculate BI population
151 BI = (diff_epsil_zeta < 0).sum(axis=0) * 100 / len(diff_epsil_zeta)
152 BII = 100 - BI
154 # save table
155 Bpopulations_df = pd.DataFrame(
156 {"Nucleotide": xlabels, "BI population": BI, "BII population": BII}
157 )
158 Bpopulations_df.to_csv(
159 self.stage_io_dict["out"]["output_csv_path"], index=False
160 )
162 # save plot
163 fig, axs = plt.subplots(1, 1, dpi=300, tight_layout=True)
164 axs.bar(range(len(xlabels)), BI, label="BI")
165 axs.bar(range(len(xlabels)), BII, bottom=BI, label="BII")
166 # empty bar to divide both sequences
167 axs.bar([len(BI) // 2], [100], color="white", label=None)
168 axs.legend()
169 axs.set_xticks(range(len(xlabels)))
170 axs.set_xticklabels(xlabels, rotation=90)
171 axs.set_xlabel("Nucleotide Sequence")
172 axs.set_ylabel("BI/BII Population (%)")
173 axs.set_title("Nucleotide parameter: BI/BII Population")
174 fig.savefig(self.stage_io_dict["out"]["output_jpg_path"], format="jpg")
175 plt.close()
177 # Copy files to host
178 self.copy_to_host()
180 # Remove temporary file(s)
181 # self.tmp_files.extend([self.stage_io_dict.get("unique_dir", "")])
182 self.remove_tmp_files()
184 self.check_arguments(output_files_created=True, raise_exception=False)
186 return 0
188 def get_xlabels(self, strand1, strand2):
189 # get list of tetramers, except first and last two bases
190 labelsW = list(strand1)
191 labelsW[0] = f"{labelsW[0]}5'"
192 labelsW[-1] = f"{labelsW[-1]}3'"
193 labelsW = [f"{i}-{j}" for i, j in zip(labelsW, range(1, len(labelsW) + 1))]
194 labelsC = list(strand2)[::-1]
195 labelsC[0] = f"{labelsC[0]}5'"
196 labelsC[-1] = f"{labelsC[-1]}3'"
197 labelsC = [f"{i}-{j}" for i, j in zip(labelsC, range(len(labelsC), 0, -1))]
199 if self.seqpos is not None:
200 labelsC = [labelsC[i] for i in self.seqpos]
201 labelsW = [labelsW[i] for i in self.seqpos]
202 xlabels = labelsW + ["-"] + labelsC
203 return xlabels
205 def get_angles_difference(self, epsilC, zetaC, epsilW, zetaW):
206 # concatenate zeta and epsil arrays
207 separator_df = pd.DataFrame({"-": nan}, index=range(len(zetaW)))
208 zeta = pd.concat([zetaW, separator_df, zetaC[zetaC.columns[::-1]]], axis=1)
209 epsil = pd.concat([epsilW, separator_df, epsilC[epsilC.columns[::-1]]], axis=1)
211 # difference between epsilon and zeta coordinates
212 diff_epsil_zeta = epsil - zeta
213 return diff_epsil_zeta
216def bipopulations(
217 input_epsilC_path: str,
218 input_epsilW_path: str,
219 input_zetaC_path: str,
220 input_zetaW_path: str,
221 output_csv_path: str,
222 output_jpg_path: str,
223 properties: Optional[dict] = None,
224 **kwargs,
225) -> int:
226 """Create :class:`BIPopulations <dna.backbone.bipopulations.BIPopulations>` class and
227 execute the: meth: `launch() <dna.backbone.bipopulations.BIPopulations.launch>` method."""
229 return BIPopulations(
230 input_epsilC_path=input_epsilC_path,
231 input_epsilW_path=input_epsilW_path,
232 input_zetaC_path=input_zetaC_path,
233 input_zetaW_path=input_zetaW_path,
234 output_csv_path=output_csv_path,
235 output_jpg_path=output_jpg_path,
236 properties=properties,
237 **kwargs,
238 ).launch()
240 bipopulations.__doc__ = BIPopulations.__doc__
243def main():
244 """Command line execution of this building block. Please check the command line documentation."""
245 parser = argparse.ArgumentParser(
246 description="Calculate BI/BII populations.",
247 formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999),
248 )
249 parser.add_argument("--config", required=False, help="Configuration file")
251 required_args = parser.add_argument_group("required arguments")
252 required_args.add_argument(
253 "--input_epsilC_path",
254 required=True,
255 help="Helical parameter <epsilC> input ser file path. Accepted formats: ser.",
256 )
257 required_args.add_argument(
258 "--input_epsilW_path",
259 required=True,
260 help="Helical parameter <epsilW> input ser file path. Accepted formats: ser.",
261 )
262 required_args.add_argument(
263 "--input_zetaC_path",
264 required=True,
265 help="Helical parameter <zetaC> input ser file path. Accepted formats: ser.",
266 )
267 required_args.add_argument(
268 "--input_zetaW_path",
269 required=True,
270 help="Helical parameter <zetaW> input ser file path. Accepted formats: ser.",
271 )
272 required_args.add_argument(
273 "--output_csv_path",
274 required=True,
275 help="Path to output csv file. Accepted formats: csv.",
276 )
277 required_args.add_argument(
278 "--output_jpg_path",
279 required=True,
280 help="Path to output jpg file. Accepted formats: jpg.",
281 )
283 args = parser.parse_args()
284 args.config = args.config or "{}"
285 properties = settings.ConfReader(config=args.config).get_prop_dic()
287 bipopulations(
288 input_epsilC_path=args.input_epsilC_path,
289 input_epsilW_path=args.input_epsilW_path,
290 input_zetaC_path=args.input_zetaC_path,
291 input_zetaW_path=args.input_zetaW_path,
292 output_csv_path=args.output_csv_path,
293 output_jpg_path=args.output_jpg_path,
294 properties=properties,
295 )
298if __name__ == "__main__":
299 main()