Coverage for biobb_dna/backbone/puckering.py: 80%
105 statements
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-28 10:36 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-28 10:36 +0000
1#!/usr/bin/env python3
2"""Module containing the Puckering class and the command line interface."""
4import argparse
5from typing import Optional
7import matplotlib.pyplot as plt
8import numpy as np
9import pandas as pd
10from biobb_common.configuration import settings
11from biobb_common.generic.biobb_object import BiobbObject
12from biobb_common.tools.file_utils import launchlogger
14from biobb_dna.utils.common import _from_string_to_list
15from biobb_dna.utils.loader import read_series
16from biobb_dna.utils.transform import inverse_complement
19class Puckering(BiobbObject):
20 """
21 | biobb_dna Puckering
22 | Calculate Puckering from phase parameters.
23 | Calculate North/East/West/South distribution of sugar puckering backbone torsions.
25 Args:
26 input_phaseC_path (str): Path to .ser file for helical parameter 'phaseC'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_phaseC.ser>`_. Accepted formats: ser (edam:format_2330).
27 input_phaseW_path (str): Path to .ser file for helical parameter 'phaseW'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_phaseW.ser>`_. Accepted formats: ser (edam:format_2330).
28 output_csv_path (str): Path to .csv file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/puckering_ref.csv>`_. Accepted formats: csv (edam:format_3752).
29 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/puckering_ref.jpg>`_. Accepted formats: jpg (edam:format_3579).
30 properties (dict):
31 * **sequence** (*str*) - (None) Nucleic acid sequence corresponding to the input .ser file. Length of sequence is expected to be the same as the total number of columns in the .ser file, minus the index column (even if later on a subset of columns is selected with the *seqpos* option).
32 * **stride** (*int*) - (1000) granularity of the number of snapshots for plotting time series.
33 * **seqpos** (*list*) - (None) list of sequence positions (columns indices starting by 0) to analyze. If not specified it will analyse the complete sequence.
34 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
35 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
36 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.
38 Examples:
39 This is a use example of how to use the building block from Python::
41 from biobb_dna.backbone.puckering import puckering
43 prop = {
44 'sequence': 'GCAT',
45 }
46 puckering(
47 input_phaseC_path='/path/to/phaseC.ser',
48 input_phaseW_path='/path/to/phaseW.ser',
49 output_csv_path='/path/to/table/output.csv',
50 output_jpg_path='/path/to/table/output.jpg',
51 properties=prop)
52 Info:
53 * wrapped_software:
54 * name: In house
55 * license: Apache-2.0
56 * ontology:
57 * name: EDAM
58 * schema: http://edamontology.org/EDAM.owl
60 """
62 def __init__(
63 self,
64 input_phaseC_path,
65 input_phaseW_path,
66 output_csv_path,
67 output_jpg_path,
68 properties=None,
69 **kwargs,
70 ) -> None:
71 properties = properties or {}
73 # Call parent class constructor
74 super().__init__(properties)
75 self.locals_var_dict = locals().copy()
77 # Input/Output files
78 self.io_dict = {
79 "in": {
80 "input_phaseC_path": input_phaseC_path,
81 "input_phaseW_path": input_phaseW_path,
82 },
83 "out": {
84 "output_csv_path": output_csv_path,
85 "output_jpg_path": output_jpg_path,
86 },
87 }
89 self.properties = properties
90 self.sequence = properties.get("sequence")
91 self.stride = properties.get("stride", 1000)
92 self.seqpos = [
93 int(elem) for elem in _from_string_to_list(properties.get("seqpos", None))
94 ]
96 # Check the properties
97 self.check_properties(properties)
98 self.check_arguments()
100 @launchlogger
101 def launch(self) -> int:
102 """Execute the :class:`Puckering <backbone.puckering.Puckering>` object."""
104 # Setup Biobb
105 if self.check_restart():
106 return 0
107 self.stage_files()
109 # check sequence
110 if self.sequence is None or len(self.sequence) < 2:
111 raise ValueError("sequence is null or too short!")
113 # check seqpos
114 if self.seqpos:
115 if (max(self.seqpos) > len(self.sequence) - 2) or (min(self.seqpos) < 1):
116 raise ValueError(
117 f"seqpos values must be between 1 and {len(self.sequence) - 2}"
118 )
119 if not (isinstance(self.seqpos, list) and len(self.seqpos) > 1):
120 raise ValueError("seqpos must be a list of at least two integers")
121 else:
122 self.seqpos = None # type: ignore
124 # read input files
125 phaseC = read_series(
126 self.stage_io_dict["in"]["input_phaseC_path"], usecols=self.seqpos
127 )
128 phaseW = read_series(
129 self.stage_io_dict["in"]["input_phaseW_path"], usecols=self.seqpos
130 )
132 # fix angle range so its not negative
133 phaseC = self.fix_angles(phaseC)
134 phaseW = self.fix_angles(phaseW)
136 # calculate difference between epsil and zeta parameters
137 xlabels = self.get_xlabels(self.sequence, inverse_complement(self.sequence))
138 Npop, Epop, Wpop, Spop = self.check_puckering(phaseC, phaseW)
140 # save plot
141 fig, axs = plt.subplots(1, 1, dpi=300, tight_layout=True)
142 axs.bar(range(len(xlabels)), Npop, label="North")
143 axs.bar(range(len(xlabels)), Epop, bottom=Npop, label="East")
144 axs.bar(range(len(xlabels)), Spop, bottom=Npop + Epop, label="South")
145 axs.bar(range(len(xlabels)), Wpop, bottom=Npop + Epop + Spop, label="West")
146 # empty bar to divide both sequences
147 axs.bar([len(phaseC.columns)], [100], color="white", label=None)
148 axs.legend()
149 axs.set_xticks(range(len(xlabels)))
150 axs.set_xticklabels(xlabels, rotation=90)
151 axs.set_xlabel("Nucleotide Sequence")
152 axs.set_ylabel("Puckering (%)")
153 axs.set_title("Nucleotide parameter: Puckering")
154 fig.savefig(self.stage_io_dict["out"]["output_jpg_path"], format="jpg")
156 # save table
157 populations = pd.DataFrame(
158 {
159 "Nucleotide": xlabels,
160 "North": Npop,
161 "East": Epop,
162 "West": Wpop,
163 "South": Spop,
164 }
165 )
166 populations.to_csv(self.stage_io_dict["out"]["output_csv_path"], index=False)
168 plt.close()
170 # Copy files to host
171 self.copy_to_host()
173 # Remove temporary file(s)
174 # self.tmp_files.extend([self.stage_io_dict.get("unique_dir", "")])
175 self.remove_tmp_files()
177 self.check_arguments(output_files_created=True, raise_exception=False)
179 return 0
181 def get_xlabels(self, strand1, strand2):
182 # get list of tetramers, except first and last two bases
183 labelsW = list(strand1)
184 labelsW[0] = f"{labelsW[0]}5'"
185 labelsW[-1] = f"{labelsW[-1]}3'"
186 labelsW = [f"{i}-{j}" for i, j in zip(labelsW, range(1, len(labelsW) + 1))]
187 labelsC = list(strand2)[::-1]
188 labelsC[0] = f"{labelsC[0]}5'"
189 labelsC[-1] = f"{labelsC[-1]}3'"
190 labelsC = [f"{i}-{j}" for i, j in zip(labelsC, range(len(labelsC), 0, -1))]
192 if self.seqpos:
193 labelsC = [labelsC[i] for i in self.seqpos]
194 labelsW = [labelsW[i] for i in self.seqpos]
195 xlabels = labelsW + ["-"] + labelsC
196 return xlabels
198 def check_puckering(self, phaseC, phaseW):
199 separator_df = pd.DataFrame({"-": np.nan}, index=range(1, len(phaseC)))
200 phase = pd.concat([phaseW, separator_df, phaseC[phaseC.columns[::-1]]], axis=1)
201 # phase.columns = columns
203 Npop = np.logical_or(phase > 315, phase < 45).mean() * 100
204 Epop = np.logical_and(phase > 45, phase < 135).mean() * 100
205 Wpop = np.logical_and(phase > 225, phase < 315).mean() * 100
206 Spop = np.logical_and(phase > 135, phase < 225).mean() * 100
207 return Npop, Epop, Wpop, Spop
209 def fix_angles(self, dataset):
210 values = np.where(dataset < 0, dataset + 360, dataset)
211 # values = np.where(values > 360, values - 360, values)
212 dataset = pd.DataFrame(values)
213 return dataset
216def puckering(
217 input_phaseC_path: str,
218 input_phaseW_path: str,
219 output_csv_path: str,
220 output_jpg_path: str,
221 properties: Optional[dict] = None,
222 **kwargs,
223) -> int:
224 """Create :class:`Puckering <dna.backbone.puckering.Puckering>` class and
225 execute the: meth: `launch() <dna.backbone.puckering.Puckering.launch>` method."""
227 return Puckering(
228 input_phaseC_path=input_phaseC_path,
229 input_phaseW_path=input_phaseW_path,
230 output_csv_path=output_csv_path,
231 output_jpg_path=output_jpg_path,
232 properties=properties,
233 **kwargs,
234 ).launch()
236 puckering.__doc__ = Puckering.__doc__
239def main():
240 """Command line execution of this building block. Please check the command line documentation."""
241 parser = argparse.ArgumentParser(
242 description="Calculate North/East/West/South distribution of sugar puckering backbone torsions.",
243 formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999),
244 )
245 parser.add_argument("--config", required=False, help="Configuration file")
247 required_args = parser.add_argument_group("required arguments")
248 required_args.add_argument(
249 "--input_phaseC_path",
250 required=True,
251 help="Helical parameter <alphaC> input ser file path. Accepted formats: ser.",
252 )
253 required_args.add_argument(
254 "--input_phaseW_path",
255 required=True,
256 help="Helical parameter <alphaW> input ser file path. Accepted formats: ser.",
257 )
258 required_args.add_argument(
259 "--output_csv_path",
260 required=True,
261 help="Path to output csv file. Accepted formats: csv.",
262 )
263 required_args.add_argument(
264 "--output_jpg_path",
265 required=True,
266 help="Path to output jpg file. Accepted formats: jpg.",
267 )
269 args = parser.parse_args()
270 args.config = args.config or "{}"
271 properties = settings.ConfReader(config=args.config).get_prop_dic()
273 puckering(
274 input_phaseC_path=args.input_phaseC_path,
275 input_phaseW_path=args.input_phaseW_path,
276 output_csv_path=args.output_csv_path,
277 output_jpg_path=args.output_jpg_path,
278 properties=properties,
279 )
282if __name__ == "__main__":
283 main()