Coverage for biobb_dna / intrabp_correlations / intraseqcorr.py: 80%
84 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-15 18:49 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-15 18:49 +0000
1#!/usr/bin/env python3
3"""Module containing the IntraSequenceCorrelation class and the command line interface."""
5from pathlib import Path
6from typing import Optional
8import matplotlib.pyplot as plt
9import numpy as np
10from biobb_common.generic.biobb_object import BiobbObject
11from biobb_common.tools.file_utils import launchlogger
13from biobb_dna.utils import constants
14from biobb_dna.utils.common import _from_string_to_list
15from biobb_dna.utils.loader import read_series
18class IntraSequenceCorrelation(BiobbObject):
19 """
20 | biobb_dna IntraSequenceCorrelation
21 | Calculate correlation between all intra-base pairs of a single sequence and for a single helical parameter.
22 | Calculate correlation between all intra-base pairs of a single sequence and for a single helical parameter.
24 Args:
25 input_ser_path (str): Path to .ser file with data for single helical parameter. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/correlation/canal_output_buckle.ser>`_. Accepted formats: ser (edam:format_2330).
26 output_csv_path (str): Path to directory where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/correlation/intra_seqcorr_buckle.csv>`_. Accepted formats: csv (edam:format_3752).
27 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/correlation/intra_seqcorr_buckle.jpg>`_. Accepted formats: jpg (edam:format_3579).
28 properties (dict):
29 * **sequence** (*str*) - (None) Nucleic acid sequence for the input .ser file. Length of sequence is expected to be the same as the total number of columns in the .ser file, minus the index column (even if later on a subset of columns is selected with the *seqpos* option).
30 * **helpar_name** (*str*) - (None) helical parameter name to add to plot title.
31 * **seqpos** (*list*) - (None) list of sequence positions (columns indices starting by 0) to analyze. If not specified it will analyse the complete sequence.
32 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
33 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
34 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.
36 Examples:
37 This is a use example of how to use the building block from Python::
39 from biobb_dna.intrabp_correlations.intraseqcorr import intraseqcorr
41 intraseqcorr(
42 input_ser_path='path/to/input/file.ser',
43 output_csv_path='path/to/output/file.csv',
44 output_jpg_path='path/to/output/plot.jpg',
45 properties=prop)
46 Info:
47 * wrapped_software:
48 * name: In house
49 * license: Apache-2.0
50 * ontology:
51 * name: EDAM
52 * schema: http://edamontology.org/EDAM.owl
54 """
56 def __init__(
57 self,
58 input_ser_path,
59 output_csv_path,
60 output_jpg_path,
61 properties=None,
62 **kwargs,
63 ) -> None:
64 properties = properties or {}
66 # Call parent class constructor
67 super().__init__(properties)
68 self.locals_var_dict = locals().copy()
70 # Input/Output files
71 self.io_dict = {
72 "in": {"input_ser_path": input_ser_path},
73 "out": {
74 "output_csv_path": output_csv_path,
75 "output_jpg_path": output_jpg_path,
76 },
77 }
79 self.properties = properties
80 self.sequence = properties.get("sequence", None)
81 self.seqpos = [
82 int(elem) for elem in _from_string_to_list(properties.get("seqpos", None))
83 ]
84 self.helpar_name = properties.get("helpar_name", None)
86 # Check the properties
87 self.check_properties(properties)
88 self.check_arguments()
90 @launchlogger
91 def launch(self) -> int:
92 """Execute the :class:`HelParCorrelation <intrabp_correlations.intraseqcorr.IntraSequenceCorrelation>` object."""
94 # Setup Biobb
95 if self.check_restart():
96 return 0
97 self.stage_files()
99 # check sequence
100 if self.sequence is None or len(self.sequence) < 2:
101 raise ValueError("sequence is null or too short!")
103 # get helical parameter from filename if not specified
104 if self.helpar_name is None:
105 for hp in constants.helical_parameters:
106 if (
107 hp.lower()
108 in Path(self.stage_io_dict["in"]["input_ser_path"]).name.lower()
109 ):
110 self.helpar_name = hp
111 if self.helpar_name is None:
112 raise ValueError(
113 "Helical parameter name can't be inferred from file, "
114 "so it must be specified!"
115 )
116 else:
117 if self.helpar_name not in constants.helical_parameters:
118 raise ValueError(
119 "Helical parameter name is invalid! "
120 f"Options: {constants.helical_parameters}"
121 )
123 # get base length and unit from helical parameter name
124 if self.helpar_name in constants.hp_angular:
125 self.method = "pearson"
126 else:
127 self.method = self.circular # type: ignore
129 # check seqpos
130 if self.seqpos:
131 if not (isinstance(self.seqpos, list) and len(self.seqpos) > 1):
132 raise ValueError("seqpos must be a list of at least two integers")
133 else:
134 self.seqpos = None # type: ignore
136 # read input .ser file
137 ser_data = read_series(
138 self.stage_io_dict["in"]["input_ser_path"], usecols=self.seqpos
139 )
140 if not self.seqpos:
141 ser_data = ser_data[ser_data.columns[1:-1]]
142 # discard first and last base(pairs) from strands
143 sequence = self.sequence[1:]
144 labels = [f"{i+1}_{sequence[i:i+1]}" for i in range(len(ser_data.columns))]
145 else:
146 labels = [f"{i+1}_{self.sequence[i:i+1]}" for i in self.seqpos]
147 ser_data.columns = labels
149 # make matrix
150 corr_data = ser_data.corr(method=self.method)
152 # save csv data
153 corr_data.to_csv(self.stage_io_dict["out"]["output_csv_path"])
155 # create heatmap
156 fig, axs = plt.subplots(1, 1, dpi=300, tight_layout=True)
157 axs.pcolor(corr_data)
158 # Loop over data dimensions and create text annotations.
159 for i in range(len(corr_data)):
160 for j in range(len(corr_data)):
161 axs.text(
162 j + 0.5,
163 i + 0.5,
164 f"{corr_data[corr_data.columns[j]].iloc[i]:.2f}",
165 ha="center",
166 va="center",
167 color="w",
168 )
169 axs.set_xticks([i + 0.5 for i in range(len(corr_data))])
170 axs.set_xticklabels(labels, rotation=90)
171 axs.set_yticks([i + 0.5 for i in range(len(corr_data))])
172 axs.set_yticklabels(labels)
173 axs.set_title(
174 "Base Pair Correlation " f"for Helical Parameter '{self.helpar_name}'"
175 )
176 fig.tight_layout()
177 fig.savefig(self.stage_io_dict["out"]["output_jpg_path"], format="jpg")
178 plt.close()
180 # Copy files to host
181 self.copy_to_host()
183 # Remove temporary file(s)
184 self.remove_tmp_files()
186 self.check_arguments(output_files_created=True, raise_exception=False)
188 return 0
190 @staticmethod
191 def circular(x1, x2):
192 x1 = x1 * np.pi / 180
193 x2 = x2 * np.pi / 180
194 diff_1 = np.sin(x1 - x1.mean())
195 diff_2 = np.sin(x2 - x2.mean())
196 num = (diff_1 * diff_2).sum()
197 den = np.sqrt((diff_1**2).sum() * (diff_2**2).sum())
198 return num / den
201def intraseqcorr(
202 input_ser_path: str,
203 output_csv_path: str,
204 output_jpg_path: str,
205 properties: Optional[dict] = None,
206 **kwargs,
207) -> int:
208 """Create :class:`HelParCorrelation <intrabp_correlations.intraseqcorr.IntraSequenceCorrelation>` class and
209 execute the :meth:`launch() <intrabp_correlations.intraseqcorr.IntraSequenceCorrelation.launch>` method."""
210 return IntraSequenceCorrelation(**dict(locals())).launch()
213intraseqcorr.__doc__ = IntraSequenceCorrelation.__doc__
214main = IntraSequenceCorrelation.get_main(intraseqcorr, "Load .ser file from Canal output and calculate correlation between base pairs of the corresponding sequence.")
216if __name__ == '__main__':
217 main()