Coverage for biobb_dna / intrabp_correlations / intraseqcorr.py: 80%

84 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-15 18:49 +0000

1#!/usr/bin/env python3 

2 

3"""Module containing the IntraSequenceCorrelation class and the command line interface.""" 

4 

5from pathlib import Path 

6from typing import Optional 

7 

8import matplotlib.pyplot as plt 

9import numpy as np 

10from biobb_common.generic.biobb_object import BiobbObject 

11from biobb_common.tools.file_utils import launchlogger 

12 

13from biobb_dna.utils import constants 

14from biobb_dna.utils.common import _from_string_to_list 

15from biobb_dna.utils.loader import read_series 

16 

17 

18class IntraSequenceCorrelation(BiobbObject): 

19 """ 

20 | biobb_dna IntraSequenceCorrelation 

21 | Calculate correlation between all intra-base pairs of a single sequence and for a single helical parameter. 

22 | Calculate correlation between all intra-base pairs of a single sequence and for a single helical parameter. 

23 

24 Args: 

25 input_ser_path (str): Path to .ser file with data for single helical parameter. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/correlation/canal_output_buckle.ser>`_. Accepted formats: ser (edam:format_2330). 

26 output_csv_path (str): Path to directory where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/correlation/intra_seqcorr_buckle.csv>`_. Accepted formats: csv (edam:format_3752). 

27 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/correlation/intra_seqcorr_buckle.jpg>`_. Accepted formats: jpg (edam:format_3579). 

28 properties (dict): 

29 * **sequence** (*str*) - (None) Nucleic acid sequence for the input .ser file. Length of sequence is expected to be the same as the total number of columns in the .ser file, minus the index column (even if later on a subset of columns is selected with the *seqpos* option). 

30 * **helpar_name** (*str*) - (None) helical parameter name to add to plot title. 

31 * **seqpos** (*list*) - (None) list of sequence positions (columns indices starting by 0) to analyze. If not specified it will analyse the complete sequence. 

32 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files. 

33 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist. 

34 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory. 

35 

36 Examples: 

37 This is a use example of how to use the building block from Python:: 

38 

39 from biobb_dna.intrabp_correlations.intraseqcorr import intraseqcorr 

40 

41 intraseqcorr( 

42 input_ser_path='path/to/input/file.ser', 

43 output_csv_path='path/to/output/file.csv', 

44 output_jpg_path='path/to/output/plot.jpg', 

45 properties=prop) 

46 Info: 

47 * wrapped_software: 

48 * name: In house 

49 * license: Apache-2.0 

50 * ontology: 

51 * name: EDAM 

52 * schema: http://edamontology.org/EDAM.owl 

53 

54 """ 

55 

56 def __init__( 

57 self, 

58 input_ser_path, 

59 output_csv_path, 

60 output_jpg_path, 

61 properties=None, 

62 **kwargs, 

63 ) -> None: 

64 properties = properties or {} 

65 

66 # Call parent class constructor 

67 super().__init__(properties) 

68 self.locals_var_dict = locals().copy() 

69 

70 # Input/Output files 

71 self.io_dict = { 

72 "in": {"input_ser_path": input_ser_path}, 

73 "out": { 

74 "output_csv_path": output_csv_path, 

75 "output_jpg_path": output_jpg_path, 

76 }, 

77 } 

78 

79 self.properties = properties 

80 self.sequence = properties.get("sequence", None) 

81 self.seqpos = [ 

82 int(elem) for elem in _from_string_to_list(properties.get("seqpos", None)) 

83 ] 

84 self.helpar_name = properties.get("helpar_name", None) 

85 

86 # Check the properties 

87 self.check_properties(properties) 

88 self.check_arguments() 

89 

90 @launchlogger 

91 def launch(self) -> int: 

92 """Execute the :class:`HelParCorrelation <intrabp_correlations.intraseqcorr.IntraSequenceCorrelation>` object.""" 

93 

94 # Setup Biobb 

95 if self.check_restart(): 

96 return 0 

97 self.stage_files() 

98 

99 # check sequence 

100 if self.sequence is None or len(self.sequence) < 2: 

101 raise ValueError("sequence is null or too short!") 

102 

103 # get helical parameter from filename if not specified 

104 if self.helpar_name is None: 

105 for hp in constants.helical_parameters: 

106 if ( 

107 hp.lower() 

108 in Path(self.stage_io_dict["in"]["input_ser_path"]).name.lower() 

109 ): 

110 self.helpar_name = hp 

111 if self.helpar_name is None: 

112 raise ValueError( 

113 "Helical parameter name can't be inferred from file, " 

114 "so it must be specified!" 

115 ) 

116 else: 

117 if self.helpar_name not in constants.helical_parameters: 

118 raise ValueError( 

119 "Helical parameter name is invalid! " 

120 f"Options: {constants.helical_parameters}" 

121 ) 

122 

123 # get base length and unit from helical parameter name 

124 if self.helpar_name in constants.hp_angular: 

125 self.method = "pearson" 

126 else: 

127 self.method = self.circular # type: ignore 

128 

129 # check seqpos 

130 if self.seqpos: 

131 if not (isinstance(self.seqpos, list) and len(self.seqpos) > 1): 

132 raise ValueError("seqpos must be a list of at least two integers") 

133 else: 

134 self.seqpos = None # type: ignore 

135 

136 # read input .ser file 

137 ser_data = read_series( 

138 self.stage_io_dict["in"]["input_ser_path"], usecols=self.seqpos 

139 ) 

140 if not self.seqpos: 

141 ser_data = ser_data[ser_data.columns[1:-1]] 

142 # discard first and last base(pairs) from strands 

143 sequence = self.sequence[1:] 

144 labels = [f"{i+1}_{sequence[i:i+1]}" for i in range(len(ser_data.columns))] 

145 else: 

146 labels = [f"{i+1}_{self.sequence[i:i+1]}" for i in self.seqpos] 

147 ser_data.columns = labels 

148 

149 # make matrix 

150 corr_data = ser_data.corr(method=self.method) 

151 

152 # save csv data 

153 corr_data.to_csv(self.stage_io_dict["out"]["output_csv_path"]) 

154 

155 # create heatmap 

156 fig, axs = plt.subplots(1, 1, dpi=300, tight_layout=True) 

157 axs.pcolor(corr_data) 

158 # Loop over data dimensions and create text annotations. 

159 for i in range(len(corr_data)): 

160 for j in range(len(corr_data)): 

161 axs.text( 

162 j + 0.5, 

163 i + 0.5, 

164 f"{corr_data[corr_data.columns[j]].iloc[i]:.2f}", 

165 ha="center", 

166 va="center", 

167 color="w", 

168 ) 

169 axs.set_xticks([i + 0.5 for i in range(len(corr_data))]) 

170 axs.set_xticklabels(labels, rotation=90) 

171 axs.set_yticks([i + 0.5 for i in range(len(corr_data))]) 

172 axs.set_yticklabels(labels) 

173 axs.set_title( 

174 "Base Pair Correlation " f"for Helical Parameter '{self.helpar_name}'" 

175 ) 

176 fig.tight_layout() 

177 fig.savefig(self.stage_io_dict["out"]["output_jpg_path"], format="jpg") 

178 plt.close() 

179 

180 # Copy files to host 

181 self.copy_to_host() 

182 

183 # Remove temporary file(s) 

184 self.remove_tmp_files() 

185 

186 self.check_arguments(output_files_created=True, raise_exception=False) 

187 

188 return 0 

189 

190 @staticmethod 

191 def circular(x1, x2): 

192 x1 = x1 * np.pi / 180 

193 x2 = x2 * np.pi / 180 

194 diff_1 = np.sin(x1 - x1.mean()) 

195 diff_2 = np.sin(x2 - x2.mean()) 

196 num = (diff_1 * diff_2).sum() 

197 den = np.sqrt((diff_1**2).sum() * (diff_2**2).sum()) 

198 return num / den 

199 

200 

201def intraseqcorr( 

202 input_ser_path: str, 

203 output_csv_path: str, 

204 output_jpg_path: str, 

205 properties: Optional[dict] = None, 

206 **kwargs, 

207) -> int: 

208 """Create :class:`HelParCorrelation <intrabp_correlations.intraseqcorr.IntraSequenceCorrelation>` class and 

209 execute the :meth:`launch() <intrabp_correlations.intraseqcorr.IntraSequenceCorrelation.launch>` method.""" 

210 return IntraSequenceCorrelation(**dict(locals())).launch() 

211 

212 

213intraseqcorr.__doc__ = IntraSequenceCorrelation.__doc__ 

214main = IntraSequenceCorrelation.get_main(intraseqcorr, "Load .ser file from Canal output and calculate correlation between base pairs of the corresponding sequence.") 

215 

216if __name__ == '__main__': 

217 main()