Coverage for biobb_dna / backbone / canonicalag.py: 91%
95 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-22 13:05 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-22 13:05 +0000
1#!/usr/bin/env python3
2"""Module containing the CanonicalAG class and the command line interface."""
4from typing import Optional
6import matplotlib.pyplot as plt
7import numpy as np
8import pandas as pd
9from biobb_common.generic.biobb_object import BiobbObject
10from biobb_common.tools.file_utils import launchlogger
12from biobb_dna.utils.common import _from_string_to_list
13from biobb_dna.utils.loader import read_series
14from biobb_dna.utils.transform import inverse_complement
17class CanonicalAG(BiobbObject):
18 """
19 | biobb_dna CanonicalAG
20 | Calculate Canonical Alpha/Gamma populations from alpha and gamma parameters.
21 | Calculate Canonical Alpha/Gamma populations from alpha and gamma parameters.
23 Args:
24 input_alphaC_path (str): Path to .ser file for helical parameter 'alphaC'. File type: input. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_alphaC.ser>`_. Accepted formats: ser (edam:format_2330).
25 input_alphaW_path (str): Path to .ser file for helical parameter 'alphaW'. File type: input. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_alphaW.ser>`_. Accepted formats: ser (edam:format_2330).
26 input_gammaC_path (str): Path to .ser file for helical parameter 'gammaC'. File type: input. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_gammaC.ser>`_. Accepted formats: ser (edam:format_2330).
27 input_gammaW_path (str): Path to .ser file for helical parameter 'gammaW'. File type: input. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_gammaW.ser>`_. Accepted formats: ser (edam:format_2330).
28 output_csv_path (str): Path to .csv file where output is saved. File type: output. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/canonag_ref.csv>`_. Accepted formats: csv (edam:format_3752).
29 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/canonag_ref.jpg>`_. Accepted formats: jpg (edam:format_3579).
30 properties (dict):
31 * **sequence** (*str*) - (None) Nucleic acid sequence corresponding to the input .ser file. Length of sequence is expected to be the same as the total number of columns in the .ser file, minus the index column (even if later on a subset of columns is selected with the *seqpos* option).
32 * **seqpos** (*list*) - (None) list of sequence positions (columns indices starting by 0) to analyze. If not specified it will analyse the complete sequence.
33 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
34 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
35 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.
37 Examples:
38 This is a use example of how to use the building block from Python::
40 from biobb_dna.backbone.canonicalag import canonicalag
42 prop = {
43 'seqpos': [1,2],
44 'sequence': 'GCAT',
45 }
46 canonicalag(
47 input_alphaC_path='/path/to/alphaC.ser',
48 input_alphaW_path='/path/to/alphaW.ser',
49 input_gammaC_path='/path/to/gammaC.ser',
50 input_gammaW_path='/path/to/gammaW.ser',
51 output_csv_path='/path/to/table/output.csv',
52 output_jpg_path='/path/to/table/output.jpg',
53 properties=prop)
54 Info:
55 * wrapped_software:
56 * name: In house
57 * license: Apache-2.0
58 * ontology:
59 * name: EDAM
60 * schema: http://edamontology.org/EDAM.owl
62 """
64 def __init__(
65 self,
66 input_alphaC_path,
67 input_alphaW_path,
68 input_gammaC_path,
69 input_gammaW_path,
70 output_csv_path,
71 output_jpg_path,
72 properties=None,
73 **kwargs,
74 ) -> None:
75 properties = properties or {}
77 # Call parent class constructor
78 super().__init__(properties)
79 self.locals_var_dict = locals().copy()
81 # Input/Output files
82 self.io_dict = {
83 "in": {
84 "input_alphaC_path": input_alphaC_path,
85 "input_alphaW_path": input_alphaW_path,
86 "input_gammaC_path": input_gammaC_path,
87 "input_gammaW_path": input_gammaW_path,
88 },
89 "out": {
90 "output_csv_path": output_csv_path,
91 "output_jpg_path": output_jpg_path,
92 },
93 }
95 self.properties = properties
96 self.sequence = properties.get("sequence")
97 self.stride = properties.get("stride", 1000)
98 self.seqpos = [
99 int(elem) for elem in _from_string_to_list(properties.get("seqpos", None))
100 ]
102 # Check the properties
103 self.check_properties(properties)
104 self.check_arguments()
106 @launchlogger
107 def launch(self) -> int:
108 """Execute the :class:`CanonicalAG <backbone.canonicalag.CanonicalAG>` object."""
110 # Setup Biobb
111 if self.check_restart():
112 return 0
113 self.stage_files()
115 # check sequence
116 if self.sequence is None or len(self.sequence) < 2:
117 raise ValueError("sequence is null or too short!")
119 # check seqpos
120 if self.seqpos:
121 if (max(self.seqpos) > len(self.sequence) - 2) or (min(self.seqpos) < 1):
122 raise ValueError(
123 f"seqpos values must be between 1 and {len(self.sequence) - 2}"
124 )
125 if not (isinstance(self.seqpos, list) and len(self.seqpos) > 1):
126 raise ValueError("seqpos must be a list of at least two integers")
127 else:
128 self.seqpos = None # type: ignore
130 # read input files
131 alphaC = read_series(
132 self.stage_io_dict["in"]["input_alphaC_path"], usecols=self.seqpos
133 )
134 alphaW = read_series(
135 self.stage_io_dict["in"]["input_alphaW_path"], usecols=self.seqpos
136 )
137 gammaC = read_series(
138 self.stage_io_dict["in"]["input_gammaC_path"], usecols=self.seqpos
139 )
140 gammaW = read_series(
141 self.stage_io_dict["in"]["input_gammaW_path"], usecols=self.seqpos
142 )
144 # fix angle range so its not negative
145 alphaC = self.fix_angles(alphaC)
146 alphaW = self.fix_angles(alphaW)
147 gammaC = self.fix_angles(gammaC)
148 gammaW = self.fix_angles(gammaW)
150 # calculate difference between epsil and zeta parameters
151 xlabels = self.get_xlabels(self.sequence, inverse_complement(self.sequence))
152 canonical_populations = self.check_alpha_gamma(alphaC, gammaC, alphaW, gammaW)
154 # save table
155 canonical_populations.name = "Canonical alpha/gamma"
156 ag_populations_df = pd.DataFrame(
157 {"Nucleotide": xlabels, "Canonical alpha/gamma": canonical_populations}
158 )
159 ag_populations_df.to_csv(
160 self.stage_io_dict["out"]["output_csv_path"], index=False
161 )
163 # save plot
164 fig, axs = plt.subplots(1, 1, dpi=300, tight_layout=True)
165 axs.bar(
166 range(len(xlabels)), canonical_populations, label="canonical alpha/gamma"
167 )
168 axs.bar(
169 range(len(xlabels)),
170 100 - canonical_populations,
171 bottom=canonical_populations,
172 label=None,
173 )
174 # empty bar to divide both sequences
175 axs.bar([len(alphaC.columns)], [100], color="white", label=None)
176 axs.legend()
177 axs.set_xticks(range(len(xlabels)))
178 axs.set_xticklabels(xlabels, rotation=90)
179 axs.set_xlabel("Nucleotide Sequence")
180 axs.set_ylabel("Canonical Alpha-Gamma (%)")
181 axs.set_title("Nucleotide parameter: Canonical Alpha-Gamma")
182 fig.savefig(self.stage_io_dict["out"]["output_jpg_path"], format="jpg")
183 plt.close()
185 # Copy files to host
186 self.copy_to_host()
188 # Remove temporary file(s)
189 self.remove_tmp_files()
191 return 0
193 def get_xlabels(self, strand1, strand2):
194 # get list of tetramers, except first and last two bases
195 labelsW = list(strand1)
196 labelsW[0] = f"{labelsW[0]}5'"
197 labelsW[-1] = f"{labelsW[-1]}3'"
198 labelsW = [f"{i}-{j}" for i, j in zip(labelsW, range(1, len(labelsW) + 1))]
199 labelsC = list(strand2)[::-1]
200 labelsC[0] = f"{labelsC[0]}5'"
201 labelsC[-1] = f"{labelsC[-1]}3'"
202 labelsC = [f"{i}-{j}" for i, j in zip(labelsC, range(len(labelsC), 0, -1))]
204 if self.seqpos:
205 labelsC = [labelsC[i] for i in self.seqpos]
206 labelsW = [labelsW[i] for i in self.seqpos]
207 xlabels = labelsW + ["-"] + labelsC
208 return xlabels
210 def check_alpha_gamma(self, alphaC, gammaC, alphaW, gammaW):
211 separator_df = pd.DataFrame({"-": np.nan}, index=range(len(gammaW)))
212 gamma = pd.concat([gammaW, separator_df, gammaC[gammaC.columns[::-1]]], axis=1)
213 alpha = pd.concat([alphaW, separator_df, alphaC[alphaC.columns[::-1]]], axis=1)
215 alpha_filter = np.logical_and(alpha > 240, alpha < 360)
216 gamma_filter = np.logical_and(gamma > 0, gamma < 120)
217 canonical_alpha_gamma = np.logical_and(alpha_filter, gamma_filter).mean() * 100
219 return canonical_alpha_gamma
221 def fix_angles(self, dataset):
222 values = np.where(dataset < 0, dataset + 360, dataset)
223 values = np.where(values > 360, values - 360, values)
224 dataset = pd.DataFrame(values)
225 return dataset
228def canonicalag(
229 input_alphaC_path: str,
230 input_alphaW_path: str,
231 input_gammaC_path: str,
232 input_gammaW_path: str,
233 output_csv_path: str,
234 output_jpg_path: str,
235 properties: Optional[dict] = None,
236 **kwargs,
237) -> int:
238 """Create :class:`CanonicalAG <dna.backbone.canonicalag.CanonicalAG>` class and
239 execute the: meth: `launch() <dna.backbone.canonicalag.CanonicalAG.launch>` method."""
240 return CanonicalAG(**dict(locals())).launch()
243canonicalag.__doc__ = CanonicalAG.__doc__
244main = CanonicalAG.get_main(canonicalag, "Calculate Canonical Alpha/Gamma distributions.")
246if __name__ == "__main__":
247 main()