Coverage for biobb_dna / interbp_correlations / interseqcorr.py: 80%

84 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-15 18:49 +0000

1#!/usr/bin/env python3 

2 

3"""Module containing the InterSequenceCorrelation class and the command line interface.""" 

4 

5from pathlib import Path 

6from typing import Optional 

7 

8import matplotlib.pyplot as plt 

9import numpy as np 

10from biobb_common.generic.biobb_object import BiobbObject 

11from biobb_common.tools.file_utils import launchlogger 

12 

13from biobb_dna.utils import constants 

14from biobb_dna.utils.common import _from_string_to_list 

15from biobb_dna.utils.loader import read_series 

16 

17 

18class InterSequenceCorrelation(BiobbObject): 

19 """ 

20 | biobb_dna InterSequenceCorrelation 

21 | Calculate correlation between all base pairs of a single sequence and for a single helical parameter. 

22 | Calculate correlation between all base pairs of a single sequence and for a single helical parameter. 

23 

24 Args: 

25 input_ser_path (str): Path to .ser file with data for single helical parameter. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/correlation/canal_output_roll.ser>`_. Accepted formats: ser (edam:format_2330). 

26 output_csv_path (str): Path to directory where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/correlation/inter_seqcorr_roll.csv>`_. Accepted formats: csv (edam:format_3752). 

27 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/correlation/inter_seqcorr_roll.jpg>`_. Accepted formats: jpg (edam:format_3579). 

28 properties (dict): 

29 * **sequence** (*str*) - (None) Nucleic acid sequence for the input .ser file. Length of sequence is expected to be the same as the total number of columns in the .ser file, minus the index column (even if later on a subset of columns is selected with the *seqpos* option). 

30 * **helpar_name** (*str*) - (None) helical parameter name to add to plot title. 

31 * **seqpos** (*list*) - (None) list of sequence positions (columns indices starting by 0) to analyze. If not specified it will analyse the complete sequence. 

32 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files. 

33 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist. 

34 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory. 

35 

36 Examples: 

37 This is a use example of how to use the building block from Python:: 

38 

39 from biobb_dna.interbp_correlations.interseqcorr import interseqcorr 

40 

41 prop = { 

42 "helpar_name": "helpar", 

43 "sequence": "CGTAATCG" 

44 } 

45 interseqcorr( 

46 input_ser_path='path/to/input/file.ser', 

47 output_csv_path='path/to/output/file.csv', 

48 output_jpg_path='path/to/output/plot.jpg', 

49 properties=prop) 

50 Info: 

51 * wrapped_software: 

52 * name: In house 

53 * license: Apache-2.0 

54 * ontology: 

55 * name: EDAM 

56 * schema: http://edamontology.org/EDAM.owl 

57 

58 """ 

59 

60 def __init__( 

61 self, 

62 input_ser_path, 

63 output_csv_path, 

64 output_jpg_path, 

65 properties=None, 

66 **kwargs, 

67 ) -> None: 

68 properties = properties or {} 

69 

70 # Call parent class constructor 

71 super().__init__(properties) 

72 self.locals_var_dict = locals().copy() 

73 

74 # Input/Output files 

75 self.io_dict = { 

76 "in": {"input_ser_path": input_ser_path}, 

77 "out": { 

78 "output_csv_path": output_csv_path, 

79 "output_jpg_path": output_jpg_path, 

80 }, 

81 } 

82 

83 self.properties = properties 

84 self.sequence = properties.get("sequence", None) 

85 self.seqpos = [ 

86 int(elem) for elem in _from_string_to_list(properties.get("seqpos", None)) 

87 ] 

88 self.helpar_name = properties.get("helpar_name", None) 

89 

90 # Check the properties 

91 self.check_properties(properties) 

92 self.check_arguments() 

93 

94 @launchlogger 

95 def launch(self) -> int: 

96 """Execute the :class:`HelParCorrelation <interbp_correlations.interseqcorr.InterSequenceCorrelation>` object.""" 

97 

98 # Setup Biobb 

99 if self.check_restart(): 

100 return 0 

101 self.stage_files() 

102 

103 # check sequence 

104 if self.sequence is None or len(self.sequence) < 2: 

105 raise ValueError("sequence is null or too short!") 

106 

107 # get helical parameter from filename if not specified 

108 if self.helpar_name is None: 

109 for hp in constants.helical_parameters: 

110 if ( 

111 hp.lower() 

112 in Path(self.stage_io_dict["in"]["input_ser_path"]).name.lower() 

113 ): 

114 self.helpar_name = hp 

115 if self.helpar_name is None: 

116 raise ValueError( 

117 "Helical parameter name can't be inferred from file, " 

118 "so it must be specified!" 

119 ) 

120 else: 

121 if self.helpar_name not in constants.helical_parameters: 

122 raise ValueError( 

123 "Helical parameter name is invalid! " 

124 f"Options: {constants.helical_parameters}" 

125 ) 

126 

127 # get base length and unit from helical parameter name 

128 if self.helpar_name in constants.hp_angular: 

129 self.method = "pearson" 

130 else: 

131 self.method = self.circular # type: ignore 

132 

133 # check seqpos 

134 if self.seqpos: 

135 if not (isinstance(self.seqpos, list) and len(self.seqpos) > 1): 

136 raise ValueError("seqpos must be a list of at least two integers") 

137 else: 

138 self.seqpos = None # type: ignore 

139 

140 # read input .ser file 

141 ser_data = read_series( 

142 self.stage_io_dict["in"]["input_ser_path"], usecols=self.seqpos 

143 ) 

144 if not self.seqpos: 

145 ser_data = ser_data[ser_data.columns[1:-1]] 

146 # discard first and last base(pairs) from strands 

147 sequence = self.sequence[1:] 

148 labels = [f"{i+1}_{sequence[i:i+2]}" for i in range(len(ser_data.columns))] 

149 else: 

150 labels = [f"{i+1}_{self.sequence[i:i+2]}" for i in self.seqpos] 

151 ser_data.columns = labels 

152 

153 # make matrix 

154 corr_data = ser_data.corr(method=self.method) 

155 

156 # save csv data 

157 corr_data.to_csv(self.stage_io_dict["out"]["output_csv_path"]) 

158 

159 # create heatmap 

160 fig, axs = plt.subplots(1, 1, dpi=300, tight_layout=True) 

161 axs.pcolor(corr_data) 

162 # Loop over data dimensions and create text annotations. 

163 for i in range(len(corr_data)): 

164 for j in range(len(corr_data)): 

165 axs.text( 

166 j + 0.5, 

167 i + 0.5, 

168 f"{corr_data[corr_data.columns[j]].iloc[i]:.2f}", 

169 ha="center", 

170 va="center", 

171 color="w", 

172 ) 

173 axs.set_xticks([i + 0.5 for i in range(len(corr_data))]) 

174 axs.set_xticklabels(labels, rotation=90) 

175 axs.set_yticks([i + 0.5 for i in range(len(corr_data))]) 

176 axs.set_yticklabels(labels) 

177 axs.set_title( 

178 "Base Pair Correlation " f"for Helical Parameter '{self.helpar_name}'" 

179 ) 

180 fig.tight_layout() 

181 fig.savefig(self.stage_io_dict["out"]["output_jpg_path"], format="jpg") 

182 plt.close() 

183 

184 # Copy files to host 

185 self.copy_to_host() 

186 

187 # Remove temporary file(s) 

188 self.remove_tmp_files() 

189 

190 self.check_arguments(output_files_created=True, raise_exception=False) 

191 

192 return 0 

193 

194 @staticmethod 

195 def circular(x1, x2): 

196 x1 = x1 * np.pi / 180 

197 x2 = x2 * np.pi / 180 

198 diff_1 = np.sin(x1 - x1.mean()) 

199 diff_2 = np.sin(x2 - x2.mean()) 

200 num = (diff_1 * diff_2).sum() 

201 den = np.sqrt((diff_1**2).sum() * (diff_2**2).sum()) 

202 return num / den 

203 

204 

205def interseqcorr( 

206 input_ser_path: str, 

207 output_csv_path: str, 

208 output_jpg_path: str, 

209 properties: Optional[dict] = None, 

210 **kwargs, 

211) -> int: 

212 """Create :class:`HelParCorrelation <interbp_correlations.interseqcorr.InterSequenceCorrelation>` class and 

213 execute the :meth:`launch() <interbp_correlations.interseqcorr.InterSequenceCorrelation.launch>` method.""" 

214 return InterSequenceCorrelation(**dict(locals())).launch() 

215 

216 

217interseqcorr.__doc__ = InterSequenceCorrelation.__doc__ 

218main = InterSequenceCorrelation.get_main(interseqcorr, "Load .ser file from Canal output and calculate correlation between base pairs of the corresponding sequence.") 

219 

220if __name__ == '__main__': 

221 main()