Coverage for biobb_dna/intrabp_correlations/intraseqcorr.py: 71%
96 statements
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-28 10:36 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-28 10:36 +0000
1#!/usr/bin/env python3
3"""Module containing the IntraSequenceCorrelation class and the command line interface."""
5import argparse
6from pathlib import Path
7from typing import Optional
9import matplotlib.pyplot as plt
10import numpy as np
11from biobb_common.configuration import settings
12from biobb_common.generic.biobb_object import BiobbObject
13from biobb_common.tools.file_utils import launchlogger
15from biobb_dna.utils import constants
16from biobb_dna.utils.common import _from_string_to_list
17from biobb_dna.utils.loader import read_series
20class IntraSequenceCorrelation(BiobbObject):
21 """
22 | biobb_dna IntraSequenceCorrelation
23 | Calculate correlation between all intra-base pairs of a single sequence and for a single helical parameter.
24 | Calculate correlation between all intra-base pairs of a single sequence and for a single helical parameter.
26 Args:
27 input_ser_path (str): Path to .ser file with data for single helical parameter. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/correlation/canal_output_buckle.ser>`_. Accepted formats: ser (edam:format_2330).
28 output_csv_path (str): Path to directory where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/correlation/intra_seqcorr_buckle.csv>`_. Accepted formats: csv (edam:format_3752).
29 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/correlation/intra_seqcorr_buckle.jpg>`_. Accepted formats: jpg (edam:format_3579).
30 properties (dict):
31 * **sequence** (*str*) - (None) Nucleic acid sequence for the input .ser file. Length of sequence is expected to be the same as the total number of columns in the .ser file, minus the index column (even if later on a subset of columns is selected with the *seqpos* option).
32 * **helpar_name** (*str*) - (None) helical parameter name to add to plot title.
33 * **seqpos** (*list*) - (None) list of sequence positions (columns indices starting by 0) to analyze. If not specified it will analyse the complete sequence.
34 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
35 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
36 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.
38 Examples:
39 This is a use example of how to use the building block from Python::
41 from biobb_dna.intrabp_correlations.intraseqcorr import intraseqcorr
43 intraseqcorr(
44 input_ser_path='path/to/input/file.ser',
45 output_csv_path='path/to/output/file.csv',
46 output_jpg_path='path/to/output/plot.jpg',
47 properties=prop)
48 Info:
49 * wrapped_software:
50 * name: In house
51 * license: Apache-2.0
52 * ontology:
53 * name: EDAM
54 * schema: http://edamontology.org/EDAM.owl
56 """
58 def __init__(
59 self,
60 input_ser_path,
61 output_csv_path,
62 output_jpg_path,
63 properties=None,
64 **kwargs,
65 ) -> None:
66 properties = properties or {}
68 # Call parent class constructor
69 super().__init__(properties)
70 self.locals_var_dict = locals().copy()
72 # Input/Output files
73 self.io_dict = {
74 "in": {"input_ser_path": input_ser_path},
75 "out": {
76 "output_csv_path": output_csv_path,
77 "output_jpg_path": output_jpg_path,
78 },
79 }
81 self.properties = properties
82 self.sequence = properties.get("sequence", None)
83 self.seqpos = [
84 int(elem) for elem in _from_string_to_list(properties.get("seqpos", None))
85 ]
86 self.helpar_name = properties.get("helpar_name", None)
88 # Check the properties
89 self.check_properties(properties)
90 self.check_arguments()
92 @launchlogger
93 def launch(self) -> int:
94 """Execute the :class:`HelParCorrelation <intrabp_correlations.intraseqcorr.IntraSequenceCorrelation>` object."""
96 # Setup Biobb
97 if self.check_restart():
98 return 0
99 self.stage_files()
101 # check sequence
102 if self.sequence is None or len(self.sequence) < 2:
103 raise ValueError("sequence is null or too short!")
105 # get helical parameter from filename if not specified
106 if self.helpar_name is None:
107 for hp in constants.helical_parameters:
108 if (
109 hp.lower()
110 in Path(self.stage_io_dict["in"]["input_ser_path"]).name.lower()
111 ):
112 self.helpar_name = hp
113 if self.helpar_name is None:
114 raise ValueError(
115 "Helical parameter name can't be inferred from file, "
116 "so it must be specified!"
117 )
118 else:
119 if self.helpar_name not in constants.helical_parameters:
120 raise ValueError(
121 "Helical parameter name is invalid! "
122 f"Options: {constants.helical_parameters}"
123 )
125 # get base length and unit from helical parameter name
126 if self.helpar_name in constants.hp_angular:
127 self.method = "pearson"
128 else:
129 self.method = self.circular # type: ignore
131 # check seqpos
132 if self.seqpos:
133 if not (isinstance(self.seqpos, list) and len(self.seqpos) > 1):
134 raise ValueError("seqpos must be a list of at least two integers")
135 else:
136 self.seqpos = None # type: ignore
138 # read input .ser file
139 ser_data = read_series(
140 self.stage_io_dict["in"]["input_ser_path"], usecols=self.seqpos
141 )
142 if not self.seqpos:
143 ser_data = ser_data[ser_data.columns[1:-1]]
144 # discard first and last base(pairs) from strands
145 sequence = self.sequence[1:]
146 labels = [f"{i+1}_{sequence[i:i+1]}" for i in range(len(ser_data.columns))]
147 else:
148 labels = [f"{i+1}_{self.sequence[i:i+1]}" for i in self.seqpos]
149 ser_data.columns = labels
151 # make matrix
152 corr_data = ser_data.corr(method=self.method)
154 # save csv data
155 corr_data.to_csv(self.stage_io_dict["out"]["output_csv_path"])
157 # create heatmap
158 fig, axs = plt.subplots(1, 1, dpi=300, tight_layout=True)
159 axs.pcolor(corr_data)
160 # Loop over data dimensions and create text annotations.
161 for i in range(len(corr_data)):
162 for j in range(len(corr_data)):
163 axs.text(
164 j + 0.5,
165 i + 0.5,
166 f"{corr_data[corr_data.columns[j]].iloc[i]:.2f}",
167 ha="center",
168 va="center",
169 color="w",
170 )
171 axs.set_xticks([i + 0.5 for i in range(len(corr_data))])
172 axs.set_xticklabels(labels, rotation=90)
173 axs.set_yticks([i + 0.5 for i in range(len(corr_data))])
174 axs.set_yticklabels(labels)
175 axs.set_title(
176 "Base Pair Correlation " f"for Helical Parameter '{self.helpar_name}'"
177 )
178 fig.tight_layout()
179 fig.savefig(self.stage_io_dict["out"]["output_jpg_path"], format="jpg")
180 plt.close()
182 # Copy files to host
183 self.copy_to_host()
185 # Remove temporary file(s)
186 # self.tmp_files.extend([self.stage_io_dict.get("unique_dir", "")])
187 self.remove_tmp_files()
189 self.check_arguments(output_files_created=True, raise_exception=False)
191 return 0
193 @staticmethod
194 def circular(x1, x2):
195 x1 = x1 * np.pi / 180
196 x2 = x2 * np.pi / 180
197 diff_1 = np.sin(x1 - x1.mean())
198 diff_2 = np.sin(x2 - x2.mean())
199 num = (diff_1 * diff_2).sum()
200 den = np.sqrt((diff_1**2).sum() * (diff_2**2).sum())
201 return num / den
204def intraseqcorr(
205 input_ser_path: str,
206 output_csv_path: str,
207 output_jpg_path: str,
208 properties: Optional[dict] = None,
209 **kwargs,
210) -> int:
211 """Create :class:`HelParCorrelation <intrabp_correlations.intraseqcorr.IntraSequenceCorrelation>` class and
212 execute the :meth:`launch() <intrabp_correlations.intraseqcorr.IntraSequenceCorrelation.launch>` method."""
214 return IntraSequenceCorrelation(
215 input_ser_path=input_ser_path,
216 output_csv_path=output_csv_path,
217 output_jpg_path=output_jpg_path,
218 properties=properties,
219 **kwargs,
220 ).launch()
222 intraseqcorr.__doc__ = IntraSequenceCorrelation.__doc__
225def main():
226 """Command line execution of this building block. Please check the command line documentation."""
227 parser = argparse.ArgumentParser(
228 description="Load .ser file from Canal output and calculate correlation between base pairs of the corresponding sequence.",
229 formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999),
230 )
231 parser.add_argument("--config", required=False, help="Configuration file")
233 required_args = parser.add_argument_group("required arguments")
234 required_args.add_argument(
235 "--input_ser_path",
236 required=True,
237 help="Path to ser file with inputs. Accepted formats: ser.",
238 )
239 required_args.add_argument(
240 "--output_csv_path",
241 required=True,
242 help="Path to output file. Accepted formats: csv.",
243 )
244 required_args.add_argument(
245 "--output_jpg_path",
246 required=True,
247 help="Path to output plot. Accepted formats: jpg.",
248 )
250 args = parser.parse_args()
251 args.config = args.config or "{}"
252 properties = settings.ConfReader(config=args.config).get_prop_dic()
254 intraseqcorr(
255 input_ser_path=args.input_ser_path,
256 output_csv_path=args.output_csv_path,
257 output_jpg_path=args.output_jpg_path,
258 properties=properties,
259 )
262if __name__ == "__main__":
263 main()