Coverage for biobb_dna / interbp_correlations / interseqcorr.py: 80%
84 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-15 18:49 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-15 18:49 +0000
1#!/usr/bin/env python3
3"""Module containing the InterSequenceCorrelation class and the command line interface."""
5from pathlib import Path
6from typing import Optional
8import matplotlib.pyplot as plt
9import numpy as np
10from biobb_common.generic.biobb_object import BiobbObject
11from biobb_common.tools.file_utils import launchlogger
13from biobb_dna.utils import constants
14from biobb_dna.utils.common import _from_string_to_list
15from biobb_dna.utils.loader import read_series
18class InterSequenceCorrelation(BiobbObject):
19 """
20 | biobb_dna InterSequenceCorrelation
21 | Calculate correlation between all base pairs of a single sequence and for a single helical parameter.
22 | Calculate correlation between all base pairs of a single sequence and for a single helical parameter.
24 Args:
25 input_ser_path (str): Path to .ser file with data for single helical parameter. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/correlation/canal_output_roll.ser>`_. Accepted formats: ser (edam:format_2330).
26 output_csv_path (str): Path to directory where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/correlation/inter_seqcorr_roll.csv>`_. Accepted formats: csv (edam:format_3752).
27 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/correlation/inter_seqcorr_roll.jpg>`_. Accepted formats: jpg (edam:format_3579).
28 properties (dict):
29 * **sequence** (*str*) - (None) Nucleic acid sequence for the input .ser file. Length of sequence is expected to be the same as the total number of columns in the .ser file, minus the index column (even if later on a subset of columns is selected with the *seqpos* option).
30 * **helpar_name** (*str*) - (None) helical parameter name to add to plot title.
31 * **seqpos** (*list*) - (None) list of sequence positions (columns indices starting by 0) to analyze. If not specified it will analyse the complete sequence.
32 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
33 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
34 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.
36 Examples:
37 This is a use example of how to use the building block from Python::
39 from biobb_dna.interbp_correlations.interseqcorr import interseqcorr
41 prop = {
42 "helpar_name": "helpar",
43 "sequence": "CGTAATCG"
44 }
45 interseqcorr(
46 input_ser_path='path/to/input/file.ser',
47 output_csv_path='path/to/output/file.csv',
48 output_jpg_path='path/to/output/plot.jpg',
49 properties=prop)
50 Info:
51 * wrapped_software:
52 * name: In house
53 * license: Apache-2.0
54 * ontology:
55 * name: EDAM
56 * schema: http://edamontology.org/EDAM.owl
58 """
60 def __init__(
61 self,
62 input_ser_path,
63 output_csv_path,
64 output_jpg_path,
65 properties=None,
66 **kwargs,
67 ) -> None:
68 properties = properties or {}
70 # Call parent class constructor
71 super().__init__(properties)
72 self.locals_var_dict = locals().copy()
74 # Input/Output files
75 self.io_dict = {
76 "in": {"input_ser_path": input_ser_path},
77 "out": {
78 "output_csv_path": output_csv_path,
79 "output_jpg_path": output_jpg_path,
80 },
81 }
83 self.properties = properties
84 self.sequence = properties.get("sequence", None)
85 self.seqpos = [
86 int(elem) for elem in _from_string_to_list(properties.get("seqpos", None))
87 ]
88 self.helpar_name = properties.get("helpar_name", None)
90 # Check the properties
91 self.check_properties(properties)
92 self.check_arguments()
94 @launchlogger
95 def launch(self) -> int:
96 """Execute the :class:`HelParCorrelation <interbp_correlations.interseqcorr.InterSequenceCorrelation>` object."""
98 # Setup Biobb
99 if self.check_restart():
100 return 0
101 self.stage_files()
103 # check sequence
104 if self.sequence is None or len(self.sequence) < 2:
105 raise ValueError("sequence is null or too short!")
107 # get helical parameter from filename if not specified
108 if self.helpar_name is None:
109 for hp in constants.helical_parameters:
110 if (
111 hp.lower()
112 in Path(self.stage_io_dict["in"]["input_ser_path"]).name.lower()
113 ):
114 self.helpar_name = hp
115 if self.helpar_name is None:
116 raise ValueError(
117 "Helical parameter name can't be inferred from file, "
118 "so it must be specified!"
119 )
120 else:
121 if self.helpar_name not in constants.helical_parameters:
122 raise ValueError(
123 "Helical parameter name is invalid! "
124 f"Options: {constants.helical_parameters}"
125 )
127 # get base length and unit from helical parameter name
128 if self.helpar_name in constants.hp_angular:
129 self.method = "pearson"
130 else:
131 self.method = self.circular # type: ignore
133 # check seqpos
134 if self.seqpos:
135 if not (isinstance(self.seqpos, list) and len(self.seqpos) > 1):
136 raise ValueError("seqpos must be a list of at least two integers")
137 else:
138 self.seqpos = None # type: ignore
140 # read input .ser file
141 ser_data = read_series(
142 self.stage_io_dict["in"]["input_ser_path"], usecols=self.seqpos
143 )
144 if not self.seqpos:
145 ser_data = ser_data[ser_data.columns[1:-1]]
146 # discard first and last base(pairs) from strands
147 sequence = self.sequence[1:]
148 labels = [f"{i+1}_{sequence[i:i+2]}" for i in range(len(ser_data.columns))]
149 else:
150 labels = [f"{i+1}_{self.sequence[i:i+2]}" for i in self.seqpos]
151 ser_data.columns = labels
153 # make matrix
154 corr_data = ser_data.corr(method=self.method)
156 # save csv data
157 corr_data.to_csv(self.stage_io_dict["out"]["output_csv_path"])
159 # create heatmap
160 fig, axs = plt.subplots(1, 1, dpi=300, tight_layout=True)
161 axs.pcolor(corr_data)
162 # Loop over data dimensions and create text annotations.
163 for i in range(len(corr_data)):
164 for j in range(len(corr_data)):
165 axs.text(
166 j + 0.5,
167 i + 0.5,
168 f"{corr_data[corr_data.columns[j]].iloc[i]:.2f}",
169 ha="center",
170 va="center",
171 color="w",
172 )
173 axs.set_xticks([i + 0.5 for i in range(len(corr_data))])
174 axs.set_xticklabels(labels, rotation=90)
175 axs.set_yticks([i + 0.5 for i in range(len(corr_data))])
176 axs.set_yticklabels(labels)
177 axs.set_title(
178 "Base Pair Correlation " f"for Helical Parameter '{self.helpar_name}'"
179 )
180 fig.tight_layout()
181 fig.savefig(self.stage_io_dict["out"]["output_jpg_path"], format="jpg")
182 plt.close()
184 # Copy files to host
185 self.copy_to_host()
187 # Remove temporary file(s)
188 self.remove_tmp_files()
190 self.check_arguments(output_files_created=True, raise_exception=False)
192 return 0
194 @staticmethod
195 def circular(x1, x2):
196 x1 = x1 * np.pi / 180
197 x2 = x2 * np.pi / 180
198 diff_1 = np.sin(x1 - x1.mean())
199 diff_2 = np.sin(x2 - x2.mean())
200 num = (diff_1 * diff_2).sum()
201 den = np.sqrt((diff_1**2).sum() * (diff_2**2).sum())
202 return num / den
205def interseqcorr(
206 input_ser_path: str,
207 output_csv_path: str,
208 output_jpg_path: str,
209 properties: Optional[dict] = None,
210 **kwargs,
211) -> int:
212 """Create :class:`HelParCorrelation <interbp_correlations.interseqcorr.InterSequenceCorrelation>` class and
213 execute the :meth:`launch() <interbp_correlations.interseqcorr.InterSequenceCorrelation.launch>` method."""
214 return InterSequenceCorrelation(**dict(locals())).launch()
217interseqcorr.__doc__ = InterSequenceCorrelation.__doc__
218main = InterSequenceCorrelation.get_main(interseqcorr, "Load .ser file from Canal output and calculate correlation between base pairs of the corresponding sequence.")
220if __name__ == '__main__':
221 main()