Coverage for biobb_dna / backbone / puckering.py: 90%
92 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-15 18:49 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-15 18:49 +0000
1#!/usr/bin/env python3
2"""Module containing the Puckering class and the command line interface."""
4from typing import Optional
6import matplotlib.pyplot as plt
7import numpy as np
8import pandas as pd
9from biobb_common.generic.biobb_object import BiobbObject
10from biobb_common.tools.file_utils import launchlogger
12from biobb_dna.utils.common import _from_string_to_list
13from biobb_dna.utils.loader import read_series
14from biobb_dna.utils.transform import inverse_complement
17class Puckering(BiobbObject):
18 """
19 | biobb_dna Puckering
20 | Calculate Puckering from phase parameters.
21 | Calculate North/East/West/South distribution of sugar puckering backbone torsions.
23 Args:
24 input_phaseC_path (str): Path to .ser file for helical parameter 'phaseC'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_phaseC.ser>`_. Accepted formats: ser (edam:format_2330).
25 input_phaseW_path (str): Path to .ser file for helical parameter 'phaseW'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_phaseW.ser>`_. Accepted formats: ser (edam:format_2330).
26 output_csv_path (str): Path to .csv file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/puckering_ref.csv>`_. Accepted formats: csv (edam:format_3752).
27 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/puckering_ref.jpg>`_. Accepted formats: jpg (edam:format_3579).
28 properties (dict):
29 * **sequence** (*str*) - (None) Nucleic acid sequence corresponding to the input .ser file. Length of sequence is expected to be the same as the total number of columns in the .ser file, minus the index column (even if later on a subset of columns is selected with the *seqpos* option).
30 * **stride** (*int*) - (1000) granularity of the number of snapshots for plotting time series.
31 * **seqpos** (*list*) - (None) list of sequence positions (columns indices starting by 0) to analyze. If not specified it will analyse the complete sequence.
32 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
33 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
34 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.
36 Examples:
37 This is a use example of how to use the building block from Python::
39 from biobb_dna.backbone.puckering import puckering
41 prop = {
42 'sequence': 'GCAT',
43 }
44 puckering(
45 input_phaseC_path='/path/to/phaseC.ser',
46 input_phaseW_path='/path/to/phaseW.ser',
47 output_csv_path='/path/to/table/output.csv',
48 output_jpg_path='/path/to/table/output.jpg',
49 properties=prop)
50 Info:
51 * wrapped_software:
52 * name: In house
53 * license: Apache-2.0
54 * ontology:
55 * name: EDAM
56 * schema: http://edamontology.org/EDAM.owl
58 """
60 def __init__(
61 self,
62 input_phaseC_path,
63 input_phaseW_path,
64 output_csv_path,
65 output_jpg_path,
66 properties=None,
67 **kwargs,
68 ) -> None:
69 properties = properties or {}
71 # Call parent class constructor
72 super().__init__(properties)
73 self.locals_var_dict = locals().copy()
75 # Input/Output files
76 self.io_dict = {
77 "in": {
78 "input_phaseC_path": input_phaseC_path,
79 "input_phaseW_path": input_phaseW_path,
80 },
81 "out": {
82 "output_csv_path": output_csv_path,
83 "output_jpg_path": output_jpg_path,
84 },
85 }
87 self.properties = properties
88 self.sequence = properties.get("sequence")
89 self.stride = properties.get("stride", 1000)
90 self.seqpos = [
91 int(elem) for elem in _from_string_to_list(properties.get("seqpos", None))
92 ]
94 # Check the properties
95 self.check_properties(properties)
96 self.check_arguments()
98 @launchlogger
99 def launch(self) -> int:
100 """Execute the :class:`Puckering <backbone.puckering.Puckering>` object."""
102 # Setup Biobb
103 if self.check_restart():
104 return 0
105 self.stage_files()
107 # check sequence
108 if self.sequence is None or len(self.sequence) < 2:
109 raise ValueError("sequence is null or too short!")
111 # check seqpos
112 if self.seqpos:
113 if (max(self.seqpos) > len(self.sequence) - 2) or (min(self.seqpos) < 1):
114 raise ValueError(
115 f"seqpos values must be between 1 and {len(self.sequence) - 2}"
116 )
117 if not (isinstance(self.seqpos, list) and len(self.seqpos) > 1):
118 raise ValueError("seqpos must be a list of at least two integers")
119 else:
120 self.seqpos = None # type: ignore
122 # read input files
123 phaseC = read_series(
124 self.stage_io_dict["in"]["input_phaseC_path"], usecols=self.seqpos
125 )
126 phaseW = read_series(
127 self.stage_io_dict["in"]["input_phaseW_path"], usecols=self.seqpos
128 )
130 # fix angle range so its not negative
131 phaseC = self.fix_angles(phaseC)
132 phaseW = self.fix_angles(phaseW)
134 # calculate difference between epsil and zeta parameters
135 xlabels = self.get_xlabels(self.sequence, inverse_complement(self.sequence))
136 Npop, Epop, Wpop, Spop = self.check_puckering(phaseC, phaseW)
138 # save plot
139 fig, axs = plt.subplots(1, 1, dpi=300, tight_layout=True)
140 axs.bar(range(len(xlabels)), Npop, label="North")
141 axs.bar(range(len(xlabels)), Epop, bottom=Npop, label="East")
142 axs.bar(range(len(xlabels)), Spop, bottom=Npop + Epop, label="South")
143 axs.bar(range(len(xlabels)), Wpop, bottom=Npop + Epop + Spop, label="West")
144 # empty bar to divide both sequences
145 axs.bar([len(phaseC.columns)], [100], color="white", label=None)
146 axs.legend()
147 axs.set_xticks(range(len(xlabels)))
148 axs.set_xticklabels(xlabels, rotation=90)
149 axs.set_xlabel("Nucleotide Sequence")
150 axs.set_ylabel("Puckering (%)")
151 axs.set_title("Nucleotide parameter: Puckering")
152 fig.savefig(self.stage_io_dict["out"]["output_jpg_path"], format="jpg")
154 # save table
155 populations = pd.DataFrame(
156 {
157 "Nucleotide": xlabels,
158 "North": Npop,
159 "East": Epop,
160 "West": Wpop,
161 "South": Spop,
162 }
163 )
164 populations.to_csv(self.stage_io_dict["out"]["output_csv_path"], index=False)
166 plt.close()
168 # Copy files to host
169 self.copy_to_host()
171 # Remove temporary file(s)
172 self.remove_tmp_files()
174 self.check_arguments(output_files_created=True, raise_exception=False)
176 return 0
178 def get_xlabels(self, strand1, strand2):
179 # get list of tetramers, except first and last two bases
180 labelsW = list(strand1)
181 labelsW[0] = f"{labelsW[0]}5'"
182 labelsW[-1] = f"{labelsW[-1]}3'"
183 labelsW = [f"{i}-{j}" for i, j in zip(labelsW, range(1, len(labelsW) + 1))]
184 labelsC = list(strand2)[::-1]
185 labelsC[0] = f"{labelsC[0]}5'"
186 labelsC[-1] = f"{labelsC[-1]}3'"
187 labelsC = [f"{i}-{j}" for i, j in zip(labelsC, range(len(labelsC), 0, -1))]
189 if self.seqpos:
190 labelsC = [labelsC[i] for i in self.seqpos]
191 labelsW = [labelsW[i] for i in self.seqpos]
192 xlabels = labelsW + ["-"] + labelsC
193 return xlabels
195 def check_puckering(self, phaseC, phaseW):
196 separator_df = pd.DataFrame({"-": np.nan}, index=range(1, len(phaseC)))
197 phase = pd.concat([phaseW, separator_df, phaseC[phaseC.columns[::-1]]], axis=1)
198 # phase.columns = columns
200 Npop = np.logical_or(phase > 315, phase < 45).mean() * 100
201 Epop = np.logical_and(phase > 45, phase < 135).mean() * 100
202 Wpop = np.logical_and(phase > 225, phase < 315).mean() * 100
203 Spop = np.logical_and(phase > 135, phase < 225).mean() * 100
204 return Npop, Epop, Wpop, Spop
206 def fix_angles(self, dataset):
207 values = np.where(dataset < 0, dataset + 360, dataset)
208 # values = np.where(values > 360, values - 360, values)
209 dataset = pd.DataFrame(values)
210 return dataset
213def puckering(
214 input_phaseC_path: str,
215 input_phaseW_path: str,
216 output_csv_path: str,
217 output_jpg_path: str,
218 properties: Optional[dict] = None,
219 **kwargs,
220) -> int:
221 """Create :class:`Puckering <dna.backbone.puckering.Puckering>` class and
222 execute the: meth: `launch() <dna.backbone.puckering.Puckering.launch>` method."""
223 return Puckering(**dict(locals())).launch()
226puckering.__doc__ = Puckering.__doc__
227main = Puckering.get_main(puckering, "Calculate North/East/West/South distribution of sugar puckering backbone torsions.")
229if __name__ == "__main__":
230 main()