Coverage for biobb_dna/intrabp_correlations/intraseqcorr.py: 71%

96 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2025-01-28 10:36 +0000

1#!/usr/bin/env python3 

2 

3"""Module containing the IntraSequenceCorrelation class and the command line interface.""" 

4 

5import argparse 

6from pathlib import Path 

7from typing import Optional 

8 

9import matplotlib.pyplot as plt 

10import numpy as np 

11from biobb_common.configuration import settings 

12from biobb_common.generic.biobb_object import BiobbObject 

13from biobb_common.tools.file_utils import launchlogger 

14 

15from biobb_dna.utils import constants 

16from biobb_dna.utils.common import _from_string_to_list 

17from biobb_dna.utils.loader import read_series 

18 

19 

20class IntraSequenceCorrelation(BiobbObject): 

21 """ 

22 | biobb_dna IntraSequenceCorrelation 

23 | Calculate correlation between all intra-base pairs of a single sequence and for a single helical parameter. 

24 | Calculate correlation between all intra-base pairs of a single sequence and for a single helical parameter. 

25 

26 Args: 

27 input_ser_path (str): Path to .ser file with data for single helical parameter. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/correlation/canal_output_buckle.ser>`_. Accepted formats: ser (edam:format_2330). 

28 output_csv_path (str): Path to directory where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/correlation/intra_seqcorr_buckle.csv>`_. Accepted formats: csv (edam:format_3752). 

29 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/correlation/intra_seqcorr_buckle.jpg>`_. Accepted formats: jpg (edam:format_3579). 

30 properties (dict): 

31 * **sequence** (*str*) - (None) Nucleic acid sequence for the input .ser file. Length of sequence is expected to be the same as the total number of columns in the .ser file, minus the index column (even if later on a subset of columns is selected with the *seqpos* option). 

32 * **helpar_name** (*str*) - (None) helical parameter name to add to plot title. 

33 * **seqpos** (*list*) - (None) list of sequence positions (columns indices starting by 0) to analyze. If not specified it will analyse the complete sequence. 

34 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files. 

35 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist. 

36 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory. 

37 

38 Examples: 

39 This is a use example of how to use the building block from Python:: 

40 

41 from biobb_dna.intrabp_correlations.intraseqcorr import intraseqcorr 

42 

43 intraseqcorr( 

44 input_ser_path='path/to/input/file.ser', 

45 output_csv_path='path/to/output/file.csv', 

46 output_jpg_path='path/to/output/plot.jpg', 

47 properties=prop) 

48 Info: 

49 * wrapped_software: 

50 * name: In house 

51 * license: Apache-2.0 

52 * ontology: 

53 * name: EDAM 

54 * schema: http://edamontology.org/EDAM.owl 

55 

56 """ 

57 

58 def __init__( 

59 self, 

60 input_ser_path, 

61 output_csv_path, 

62 output_jpg_path, 

63 properties=None, 

64 **kwargs, 

65 ) -> None: 

66 properties = properties or {} 

67 

68 # Call parent class constructor 

69 super().__init__(properties) 

70 self.locals_var_dict = locals().copy() 

71 

72 # Input/Output files 

73 self.io_dict = { 

74 "in": {"input_ser_path": input_ser_path}, 

75 "out": { 

76 "output_csv_path": output_csv_path, 

77 "output_jpg_path": output_jpg_path, 

78 }, 

79 } 

80 

81 self.properties = properties 

82 self.sequence = properties.get("sequence", None) 

83 self.seqpos = [ 

84 int(elem) for elem in _from_string_to_list(properties.get("seqpos", None)) 

85 ] 

86 self.helpar_name = properties.get("helpar_name", None) 

87 

88 # Check the properties 

89 self.check_properties(properties) 

90 self.check_arguments() 

91 

92 @launchlogger 

93 def launch(self) -> int: 

94 """Execute the :class:`HelParCorrelation <intrabp_correlations.intraseqcorr.IntraSequenceCorrelation>` object.""" 

95 

96 # Setup Biobb 

97 if self.check_restart(): 

98 return 0 

99 self.stage_files() 

100 

101 # check sequence 

102 if self.sequence is None or len(self.sequence) < 2: 

103 raise ValueError("sequence is null or too short!") 

104 

105 # get helical parameter from filename if not specified 

106 if self.helpar_name is None: 

107 for hp in constants.helical_parameters: 

108 if ( 

109 hp.lower() 

110 in Path(self.stage_io_dict["in"]["input_ser_path"]).name.lower() 

111 ): 

112 self.helpar_name = hp 

113 if self.helpar_name is None: 

114 raise ValueError( 

115 "Helical parameter name can't be inferred from file, " 

116 "so it must be specified!" 

117 ) 

118 else: 

119 if self.helpar_name not in constants.helical_parameters: 

120 raise ValueError( 

121 "Helical parameter name is invalid! " 

122 f"Options: {constants.helical_parameters}" 

123 ) 

124 

125 # get base length and unit from helical parameter name 

126 if self.helpar_name in constants.hp_angular: 

127 self.method = "pearson" 

128 else: 

129 self.method = self.circular # type: ignore 

130 

131 # check seqpos 

132 if self.seqpos: 

133 if not (isinstance(self.seqpos, list) and len(self.seqpos) > 1): 

134 raise ValueError("seqpos must be a list of at least two integers") 

135 else: 

136 self.seqpos = None # type: ignore 

137 

138 # read input .ser file 

139 ser_data = read_series( 

140 self.stage_io_dict["in"]["input_ser_path"], usecols=self.seqpos 

141 ) 

142 if not self.seqpos: 

143 ser_data = ser_data[ser_data.columns[1:-1]] 

144 # discard first and last base(pairs) from strands 

145 sequence = self.sequence[1:] 

146 labels = [f"{i+1}_{sequence[i:i+1]}" for i in range(len(ser_data.columns))] 

147 else: 

148 labels = [f"{i+1}_{self.sequence[i:i+1]}" for i in self.seqpos] 

149 ser_data.columns = labels 

150 

151 # make matrix 

152 corr_data = ser_data.corr(method=self.method) 

153 

154 # save csv data 

155 corr_data.to_csv(self.stage_io_dict["out"]["output_csv_path"]) 

156 

157 # create heatmap 

158 fig, axs = plt.subplots(1, 1, dpi=300, tight_layout=True) 

159 axs.pcolor(corr_data) 

160 # Loop over data dimensions and create text annotations. 

161 for i in range(len(corr_data)): 

162 for j in range(len(corr_data)): 

163 axs.text( 

164 j + 0.5, 

165 i + 0.5, 

166 f"{corr_data[corr_data.columns[j]].iloc[i]:.2f}", 

167 ha="center", 

168 va="center", 

169 color="w", 

170 ) 

171 axs.set_xticks([i + 0.5 for i in range(len(corr_data))]) 

172 axs.set_xticklabels(labels, rotation=90) 

173 axs.set_yticks([i + 0.5 for i in range(len(corr_data))]) 

174 axs.set_yticklabels(labels) 

175 axs.set_title( 

176 "Base Pair Correlation " f"for Helical Parameter '{self.helpar_name}'" 

177 ) 

178 fig.tight_layout() 

179 fig.savefig(self.stage_io_dict["out"]["output_jpg_path"], format="jpg") 

180 plt.close() 

181 

182 # Copy files to host 

183 self.copy_to_host() 

184 

185 # Remove temporary file(s) 

186 # self.tmp_files.extend([self.stage_io_dict.get("unique_dir", "")]) 

187 self.remove_tmp_files() 

188 

189 self.check_arguments(output_files_created=True, raise_exception=False) 

190 

191 return 0 

192 

193 @staticmethod 

194 def circular(x1, x2): 

195 x1 = x1 * np.pi / 180 

196 x2 = x2 * np.pi / 180 

197 diff_1 = np.sin(x1 - x1.mean()) 

198 diff_2 = np.sin(x2 - x2.mean()) 

199 num = (diff_1 * diff_2).sum() 

200 den = np.sqrt((diff_1**2).sum() * (diff_2**2).sum()) 

201 return num / den 

202 

203 

204def intraseqcorr( 

205 input_ser_path: str, 

206 output_csv_path: str, 

207 output_jpg_path: str, 

208 properties: Optional[dict] = None, 

209 **kwargs, 

210) -> int: 

211 """Create :class:`HelParCorrelation <intrabp_correlations.intraseqcorr.IntraSequenceCorrelation>` class and 

212 execute the :meth:`launch() <intrabp_correlations.intraseqcorr.IntraSequenceCorrelation.launch>` method.""" 

213 

214 return IntraSequenceCorrelation( 

215 input_ser_path=input_ser_path, 

216 output_csv_path=output_csv_path, 

217 output_jpg_path=output_jpg_path, 

218 properties=properties, 

219 **kwargs, 

220 ).launch() 

221 

222 intraseqcorr.__doc__ = IntraSequenceCorrelation.__doc__ 

223 

224 

225def main(): 

226 """Command line execution of this building block. Please check the command line documentation.""" 

227 parser = argparse.ArgumentParser( 

228 description="Load .ser file from Canal output and calculate correlation between base pairs of the corresponding sequence.", 

229 formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999), 

230 ) 

231 parser.add_argument("--config", required=False, help="Configuration file") 

232 

233 required_args = parser.add_argument_group("required arguments") 

234 required_args.add_argument( 

235 "--input_ser_path", 

236 required=True, 

237 help="Path to ser file with inputs. Accepted formats: ser.", 

238 ) 

239 required_args.add_argument( 

240 "--output_csv_path", 

241 required=True, 

242 help="Path to output file. Accepted formats: csv.", 

243 ) 

244 required_args.add_argument( 

245 "--output_jpg_path", 

246 required=True, 

247 help="Path to output plot. Accepted formats: jpg.", 

248 ) 

249 

250 args = parser.parse_args() 

251 args.config = args.config or "{}" 

252 properties = settings.ConfReader(config=args.config).get_prop_dic() 

253 

254 intraseqcorr( 

255 input_ser_path=args.input_ser_path, 

256 output_csv_path=args.output_csv_path, 

257 output_jpg_path=args.output_jpg_path, 

258 properties=properties, 

259 ) 

260 

261 

262if __name__ == "__main__": 

263 main()