Coverage for biobb_dna/backbone/bipopulations.py: 79%
105 statements
« prev ^ index » next coverage.py v7.5.1, created at 2024-05-07 09:06 +0000
« prev ^ index » next coverage.py v7.5.1, created at 2024-05-07 09:06 +0000
1#!/usr/bin/env python3
2"""Module containing the BIPopulations class and the command line interface."""
4import shutil
5import argparse
7import matplotlib.pyplot as plt
8import pandas as pd
9from numpy import nan
10from biobb_dna.utils.loader import read_series
11from biobb_dna.utils.transform import inverse_complement
12from biobb_common.generic.biobb_object import BiobbObject
13from biobb_common.tools.file_utils import launchlogger
14from biobb_common.tools import file_utils as fu
15from biobb_common.configuration import settings
18class BIPopulations(BiobbObject):
19 """
20 | biobb_dna BIPopulations
21 | Calculate BI/BII populations from epsilon and zeta parameters.
23 Args:
24 input_epsilC_path (str): Path to .ser file for helical parameter 'epsilC'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_epsilC.ser>`_. Accepted formats: ser (edam:format_2330).
25 input_epsilW_path (str): Path to .ser file for helical parameter 'epsilW'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_epsilW.ser>`_. Accepted formats: ser (edam:format_2330).
26 input_zetaC_path (str): Path to .ser file for helical parameter 'zetaC'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_zetaC.ser>`_. Accepted formats: ser (edam:format_2330).
27 input_zetaW_path (str): Path to .ser file for helical parameter 'zetaW'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_zetaW.ser>`_. Accepted formats: ser (edam:format_2330).
28 output_csv_path (str): Path to .csv file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/bipop_ref.csv>`_. Accepted formats: csv (edam:format_3752).
29 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/bipop_ref.jpg>`_. Accepted formats: jpg (edam:format_3579).
30 properties (dict):
31 * **sequence** (*str*) - (None) Nucleic acid sequence corresponding to the input .ser file. Length of sequence is expected to be the same as the total number of columns in the .ser file, minus the index column (even if later on a subset of columns is selected with the *seqpos* option).
32 * **seqpos** (*list*) - (None) list of sequence positions (columns indices starting by 0) to analyze. If not specified it will analyse the complete sequence.
33 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
34 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
36 Examples:
37 This is a use example of how to use the building block from Python::
39 from biobb_dna.backbone.bipopulations import bipopulations
41 prop = {
42 'sequence': 'GCAT',
43 }
44 bipopulations(
45 input_epsilC_path='/path/to/epsilC.ser',
46 input_epsilW_path='/path/to/epsilW.ser',
47 input_zetaC_path='/path/to/zetaC.ser',
48 input_zetaW_path='/path/to/zetaW.ser',
49 output_csv_path='/path/to/table/output.csv',
50 output_jpg_path='/path/to/table/output.jpg',
51 properties=prop)
52 Info:
53 * wrapped_software:
54 * name: In house
55 * license: Apache-2.0
56 * ontology:
57 * name: EDAM
58 * schema: http://edamontology.org/EDAM.owl
60 """
62 def __init__(self, input_epsilC_path, input_epsilW_path,
63 input_zetaC_path, input_zetaW_path,
64 output_csv_path, output_jpg_path,
65 properties=None, **kwargs) -> None:
66 properties = properties or {}
68 # Call parent class constructor
69 super().__init__(properties)
70 self.locals_var_dict = locals().copy()
72 # Input/Output files
73 self.io_dict = {
74 'in': {
75 'input_epsilC_path': input_epsilC_path,
76 'input_epsilW_path': input_epsilW_path,
77 'input_zetaC_path': input_zetaC_path,
78 'input_zetaW_path': input_zetaW_path
79 },
80 'out': {
81 'output_csv_path': output_csv_path,
82 'output_jpg_path': output_jpg_path
83 }
84 }
86 self.properties = properties
87 self.sequence = properties.get("sequence")
88 self.seqpos = properties.get("seqpos", None)
90 # Check the properties
91 self.check_properties(properties)
92 self.check_arguments()
94 @launchlogger
95 def launch(self) -> int:
96 """Execute the :class:`BIPopulations <backbone.bipopulations.BIPopulations>` object."""
98 # Setup Biobb
99 if self.check_restart():
100 return 0
101 self.stage_files()
103 # check sequence
104 if self.sequence is None or len(self.sequence) < 2:
105 raise ValueError("sequence is null or too short!")
107 # check seqpos
108 if self.seqpos is not None:
109 if (max(self.seqpos) > len(self.sequence) - 2) or (min(self.seqpos) < 1):
110 raise ValueError(
111 f"seqpos values must be between 1 and {len(self.sequence) - 2}")
112 if not (isinstance(self.seqpos, list) and len(self.seqpos) > 1):
113 raise ValueError(
114 "seqpos must be a list of at least two integers")
116 # Creating temporary folder
117 self.tmp_folder = fu.create_unique_dir(prefix="backbone_")
118 fu.log('Creating %s temporary folder' % self.tmp_folder, self.out_log)
120 # Copy input_file_path1 to temporary folder
121 shutil.copy(self.io_dict['in']['input_epsilC_path'], self.tmp_folder)
122 shutil.copy(self.io_dict['in']['input_epsilW_path'], self.tmp_folder)
123 shutil.copy(self.io_dict['in']['input_zetaC_path'], self.tmp_folder)
124 shutil.copy(self.io_dict['in']['input_zetaW_path'], self.tmp_folder)
126 # read input files
127 epsilC = read_series(
128 self.io_dict['in']['input_epsilC_path'],
129 usecols=self.seqpos)
130 epsilW = read_series(
131 self.io_dict['in']['input_epsilW_path'],
132 usecols=self.seqpos)
133 zetaC = read_series(
134 self.io_dict['in']['input_zetaC_path'],
135 usecols=self.seqpos)
136 zetaW = read_series(
137 self.io_dict['in']['input_zetaW_path'],
138 usecols=self.seqpos)
140 # calculate difference between epsil and zeta parameters
141 xlabels = self.get_xlabels(
142 self.sequence,
143 inverse_complement(self.sequence))
144 diff_epsil_zeta = self.get_angles_difference(
145 epsilC,
146 zetaC,
147 epsilW,
148 zetaW)
150 # calculate BI population
151 BI = (diff_epsil_zeta < 0).sum(axis=0) * 100 / len(diff_epsil_zeta)
152 BII = 100 - BI
154 # save table
155 Bpopulations_df = pd.DataFrame({
156 "Nucleotide": xlabels,
157 "BI population": BI,
158 "BII population": BII})
159 Bpopulations_df.to_csv(
160 self.io_dict['out']['output_csv_path'],
161 index=False)
163 # save plot
164 fig, axs = plt.subplots(1, 1, dpi=300, tight_layout=True)
165 axs.bar(
166 range(len(xlabels)),
167 BI,
168 label="BI")
169 axs.bar(
170 range(len(xlabels)),
171 BII,
172 bottom=BI,
173 label="BII")
174 # empty bar to divide both sequences
175 axs.bar(
176 [len(BI)//2],
177 [100],
178 color='white',
179 label=None)
180 axs.legend()
181 axs.set_xticks(range(len(xlabels)))
182 axs.set_xticklabels(xlabels, rotation=90)
183 axs.set_xlabel("Nucleotide Sequence")
184 axs.set_ylabel("BI/BII Population (%)")
185 axs.set_title("Nucleotide parameter: BI/BII Population")
186 fig.savefig(
187 self.io_dict['out']['output_jpg_path'],
188 format="jpg")
189 plt.close()
191 # Remove temporary file(s)
192 self.tmp_files.extend([
193 self.stage_io_dict.get("unique_dir"),
194 self.tmp_folder
195 ])
196 self.remove_tmp_files()
198 self.check_arguments(output_files_created=True, raise_exception=False)
200 return 0
202 def get_xlabels(self, strand1, strand2):
203 # get list of tetramers, except first and last two bases
204 labelsW = list(strand1)
205 labelsW[0] = f"{labelsW[0]}5\'"
206 labelsW[-1] = f"{labelsW[-1]}3\'"
207 labelsW = [
208 f"{i}-{j}" for i, j in zip(labelsW, range(1, len(labelsW)+1))]
209 labelsC = list(strand2)[::-1]
210 labelsC[0] = f"{labelsC[0]}5\'"
211 labelsC[-1] = f"{labelsC[-1]}3\'"
212 labelsC = [
213 f"{i}-{j}" for i, j in zip(labelsC, range(len(labelsC), 0, -1))]
215 if self.seqpos is not None:
216 labelsC = [labelsC[i] for i in self.seqpos]
217 labelsW = [labelsW[i] for i in self.seqpos]
218 xlabels = labelsW + ['-'] + labelsC
219 return xlabels
221 def get_angles_difference(self, epsilC, zetaC, epsilW, zetaW):
222 # concatenate zeta and epsil arrays
223 separator_df = pd.DataFrame({"-": nan}, index=range(len(zetaW)))
224 zeta = pd.concat([
225 zetaW,
226 separator_df,
227 zetaC[zetaC.columns[::-1]]],
228 axis=1)
229 epsil = pd.concat([
230 epsilW,
231 separator_df,
232 epsilC[epsilC.columns[::-1]]],
233 axis=1)
235 # difference between epsilon and zeta coordinates
236 diff_epsil_zeta = epsil - zeta
237 return diff_epsil_zeta
240def bipopulations(
241 input_epsilC_path: str, input_epsilW_path: str,
242 input_zetaC_path: str, input_zetaW_path: str,
243 output_csv_path: str, output_jpg_path: str,
244 properties: dict = None, **kwargs) -> int:
245 """Create :class:`BIPopulations <dna.backbone.bipopulations.BIPopulations>` class and
246 execute the: meth: `launch() <dna.backbone.bipopulations.BIPopulations.launch>` method. """
248 return BIPopulations(
249 input_epsilC_path=input_epsilC_path,
250 input_epsilW_path=input_epsilW_path,
251 input_zetaC_path=input_zetaC_path,
252 input_zetaW_path=input_zetaW_path,
253 output_csv_path=output_csv_path,
254 output_jpg_path=output_jpg_path,
255 properties=properties, **kwargs).launch()
258def main():
259 """Command line execution of this building block. Please check the command line documentation."""
260 parser = argparse.ArgumentParser(description='Calculate BI/BII populations.',
261 formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999))
262 parser.add_argument('--config', required=False, help='Configuration file')
264 required_args = parser.add_argument_group('required arguments')
265 required_args.add_argument('--input_epsilC_path', required=True,
266 help='Helical parameter <epsilC> input ser file path. Accepted formats: ser.')
267 required_args.add_argument('--input_epsilW_path', required=True,
268 help='Helical parameter <epsilW> input ser file path. Accepted formats: ser.')
269 required_args.add_argument('--input_zetaC_path', required=True,
270 help='Helical parameter <zetaC> input ser file path. Accepted formats: ser.')
271 required_args.add_argument('--input_zetaW_path', required=True,
272 help='Helical parameter <zetaW> input ser file path. Accepted formats: ser.')
273 required_args.add_argument('--output_csv_path', required=True,
274 help='Path to output csv file. Accepted formats: csv.')
275 required_args.add_argument('--output_jpg_path', required=True,
276 help='Path to output jpg file. Accepted formats: jpg.')
278 args = parser.parse_args()
279 args.config = args.config or "{}"
280 properties = settings.ConfReader(config=args.config).get_prop_dic()
282 bipopulations(
283 input_epsilC_path=args.input_epsilC_path,
284 input_epsilW_path=args.input_epsilW_path,
285 input_zetaC_path=args.input_zetaC_path,
286 input_zetaW_path=args.input_zetaW_path,
287 output_csv_path=args.output_csv_path,
288 output_jpg_path=args.output_jpg_path,
289 properties=properties)
292if __name__ == '__main__':
293 main()