Coverage for biobb_dna/backbone/canonicalag.py: 79%

110 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2025-01-28 10:36 +0000

1#!/usr/bin/env python3 

2"""Module containing the CanonicalAG class and the command line interface.""" 

3 

4import argparse 

5from typing import Optional 

6 

7import matplotlib.pyplot as plt 

8import numpy as np 

9import pandas as pd 

10from biobb_common.configuration import settings 

11from biobb_common.generic.biobb_object import BiobbObject 

12from biobb_common.tools.file_utils import launchlogger 

13 

14from biobb_dna.utils.common import _from_string_to_list 

15from biobb_dna.utils.loader import read_series 

16from biobb_dna.utils.transform import inverse_complement 

17 

18 

19class CanonicalAG(BiobbObject): 

20 """ 

21 | biobb_dna CanonicalAG 

22 | Calculate Canonical Alpha/Gamma populations from alpha and gamma parameters. 

23 | Calculate Canonical Alpha/Gamma populations from alpha and gamma parameters. 

24 

25 Args: 

26 input_alphaC_path (str): Path to .ser file for helical parameter 'alphaC'. File type: input. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_alphaC.ser>`_. Accepted formats: ser (edam:format_2330). 

27 input_alphaW_path (str): Path to .ser file for helical parameter 'alphaW'. File type: input. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_alphaW.ser>`_. Accepted formats: ser (edam:format_2330). 

28 input_gammaC_path (str): Path to .ser file for helical parameter 'gammaC'. File type: input. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_gammaC.ser>`_. Accepted formats: ser (edam:format_2330). 

29 input_gammaW_path (str): Path to .ser file for helical parameter 'gammaW'. File type: input. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_gammaW.ser>`_. Accepted formats: ser (edam:format_2330). 

30 output_csv_path (str): Path to .csv file where output is saved. File type: output. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/canonag_ref.csv>`_. Accepted formats: csv (edam:format_3752). 

31 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/canonag_ref.jpg>`_. Accepted formats: jpg (edam:format_3579). 

32 properties (dict): 

33 * **sequence** (*str*) - (None) Nucleic acid sequence corresponding to the input .ser file. Length of sequence is expected to be the same as the total number of columns in the .ser file, minus the index column (even if later on a subset of columns is selected with the *seqpos* option). 

34 * **seqpos** (*list*) - (None) list of sequence positions (columns indices starting by 0) to analyze. If not specified it will analyse the complete sequence. 

35 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files. 

36 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist. 

37 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory. 

38 

39 Examples: 

40 This is a use example of how to use the building block from Python:: 

41 

42 from biobb_dna.backbone.canonicalag import canonicalag 

43 

44 prop = { 

45 'seqpos': [1,2], 

46 'sequence': 'GCAT', 

47 } 

48 canonicalag( 

49 input_alphaC_path='/path/to/alphaC.ser', 

50 input_alphaW_path='/path/to/alphaW.ser', 

51 input_gammaC_path='/path/to/gammaC.ser', 

52 input_gammaW_path='/path/to/gammaW.ser', 

53 output_csv_path='/path/to/table/output.csv', 

54 output_jpg_path='/path/to/table/output.jpg', 

55 properties=prop) 

56 Info: 

57 * wrapped_software: 

58 * name: In house 

59 * license: Apache-2.0 

60 * ontology: 

61 * name: EDAM 

62 * schema: http://edamontology.org/EDAM.owl 

63 

64 """ 

65 

66 def __init__( 

67 self, 

68 input_alphaC_path, 

69 input_alphaW_path, 

70 input_gammaC_path, 

71 input_gammaW_path, 

72 output_csv_path, 

73 output_jpg_path, 

74 properties=None, 

75 **kwargs, 

76 ) -> None: 

77 properties = properties or {} 

78 

79 # Call parent class constructor 

80 super().__init__(properties) 

81 self.locals_var_dict = locals().copy() 

82 

83 # Input/Output files 

84 self.io_dict = { 

85 "in": { 

86 "input_alphaC_path": input_alphaC_path, 

87 "input_alphaW_path": input_alphaW_path, 

88 "input_gammaC_path": input_gammaC_path, 

89 "input_gammaW_path": input_gammaW_path, 

90 }, 

91 "out": { 

92 "output_csv_path": output_csv_path, 

93 "output_jpg_path": output_jpg_path, 

94 }, 

95 } 

96 

97 self.properties = properties 

98 self.sequence = properties.get("sequence") 

99 self.stride = properties.get("stride", 1000) 

100 self.seqpos = [ 

101 int(elem) for elem in _from_string_to_list(properties.get("seqpos", None)) 

102 ] 

103 

104 # Check the properties 

105 self.check_properties(properties) 

106 self.check_arguments() 

107 

108 @launchlogger 

109 def launch(self) -> int: 

110 """Execute the :class:`CanonicalAG <backbone.canonicalag.CanonicalAG>` object.""" 

111 

112 # Setup Biobb 

113 if self.check_restart(): 

114 return 0 

115 self.stage_files() 

116 

117 # check sequence 

118 if self.sequence is None or len(self.sequence) < 2: 

119 raise ValueError("sequence is null or too short!") 

120 

121 # check seqpos 

122 if self.seqpos: 

123 if (max(self.seqpos) > len(self.sequence) - 2) or (min(self.seqpos) < 1): 

124 raise ValueError( 

125 f"seqpos values must be between 1 and {len(self.sequence) - 2}" 

126 ) 

127 if not (isinstance(self.seqpos, list) and len(self.seqpos) > 1): 

128 raise ValueError("seqpos must be a list of at least two integers") 

129 else: 

130 self.seqpos = None # type: ignore 

131 

132 # read input files 

133 alphaC = read_series( 

134 self.stage_io_dict["in"]["input_alphaC_path"], usecols=self.seqpos 

135 ) 

136 alphaW = read_series( 

137 self.stage_io_dict["in"]["input_alphaW_path"], usecols=self.seqpos 

138 ) 

139 gammaC = read_series( 

140 self.stage_io_dict["in"]["input_gammaC_path"], usecols=self.seqpos 

141 ) 

142 gammaW = read_series( 

143 self.stage_io_dict["in"]["input_gammaW_path"], usecols=self.seqpos 

144 ) 

145 

146 # fix angle range so its not negative 

147 alphaC = self.fix_angles(alphaC) 

148 alphaW = self.fix_angles(alphaW) 

149 gammaC = self.fix_angles(gammaC) 

150 gammaW = self.fix_angles(gammaW) 

151 

152 # calculate difference between epsil and zeta parameters 

153 xlabels = self.get_xlabels(self.sequence, inverse_complement(self.sequence)) 

154 canonical_populations = self.check_alpha_gamma(alphaC, gammaC, alphaW, gammaW) 

155 

156 # save table 

157 canonical_populations.name = "Canonical alpha/gamma" 

158 ag_populations_df = pd.DataFrame( 

159 {"Nucleotide": xlabels, "Canonical alpha/gamma": canonical_populations} 

160 ) 

161 ag_populations_df.to_csv( 

162 self.stage_io_dict["out"]["output_csv_path"], index=False 

163 ) 

164 

165 # save plot 

166 fig, axs = plt.subplots(1, 1, dpi=300, tight_layout=True) 

167 axs.bar( 

168 range(len(xlabels)), canonical_populations, label="canonical alpha/gamma" 

169 ) 

170 axs.bar( 

171 range(len(xlabels)), 

172 100 - canonical_populations, 

173 bottom=canonical_populations, 

174 label=None, 

175 ) 

176 # empty bar to divide both sequences 

177 axs.bar([len(alphaC.columns)], [100], color="white", label=None) 

178 axs.legend() 

179 axs.set_xticks(range(len(xlabels))) 

180 axs.set_xticklabels(xlabels, rotation=90) 

181 axs.set_xlabel("Nucleotide Sequence") 

182 axs.set_ylabel("Canonical Alpha-Gamma (%)") 

183 axs.set_title("Nucleotide parameter: Canonical Alpha-Gamma") 

184 fig.savefig(self.stage_io_dict["out"]["output_jpg_path"], format="jpg") 

185 plt.close() 

186 

187 # Copy files to host 

188 self.copy_to_host() 

189 

190 # Remove temporary file(s) 

191 # self.tmp_files.extend([self.stage_io_dict.get("unique_dir", "")]) 

192 self.remove_tmp_files() 

193 

194 return 0 

195 

196 def get_xlabels(self, strand1, strand2): 

197 # get list of tetramers, except first and last two bases 

198 labelsW = list(strand1) 

199 labelsW[0] = f"{labelsW[0]}5'" 

200 labelsW[-1] = f"{labelsW[-1]}3'" 

201 labelsW = [f"{i}-{j}" for i, j in zip(labelsW, range(1, len(labelsW) + 1))] 

202 labelsC = list(strand2)[::-1] 

203 labelsC[0] = f"{labelsC[0]}5'" 

204 labelsC[-1] = f"{labelsC[-1]}3'" 

205 labelsC = [f"{i}-{j}" for i, j in zip(labelsC, range(len(labelsC), 0, -1))] 

206 

207 if self.seqpos: 

208 labelsC = [labelsC[i] for i in self.seqpos] 

209 labelsW = [labelsW[i] for i in self.seqpos] 

210 xlabels = labelsW + ["-"] + labelsC 

211 return xlabels 

212 

213 def check_alpha_gamma(self, alphaC, gammaC, alphaW, gammaW): 

214 separator_df = pd.DataFrame({"-": np.nan}, index=range(len(gammaW))) 

215 gamma = pd.concat([gammaW, separator_df, gammaC[gammaC.columns[::-1]]], axis=1) 

216 alpha = pd.concat([alphaW, separator_df, alphaC[alphaC.columns[::-1]]], axis=1) 

217 

218 alpha_filter = np.logical_and(alpha > 240, alpha < 360) 

219 gamma_filter = np.logical_and(gamma > 0, gamma < 120) 

220 canonical_alpha_gamma = np.logical_and(alpha_filter, gamma_filter).mean() * 100 

221 

222 return canonical_alpha_gamma 

223 

224 def fix_angles(self, dataset): 

225 values = np.where(dataset < 0, dataset + 360, dataset) 

226 values = np.where(values > 360, values - 360, values) 

227 dataset = pd.DataFrame(values) 

228 return dataset 

229 

230 

231def canonicalag( 

232 input_alphaC_path: str, 

233 input_alphaW_path: str, 

234 input_gammaC_path: str, 

235 input_gammaW_path: str, 

236 output_csv_path: str, 

237 output_jpg_path: str, 

238 properties: Optional[dict] = None, 

239 **kwargs, 

240) -> int: 

241 """Create :class:`CanonicalAG <dna.backbone.canonicalag.CanonicalAG>` class and 

242 execute the: meth: `launch() <dna.backbone.canonicalag.CanonicalAG.launch>` method.""" 

243 

244 return CanonicalAG( 

245 input_alphaC_path=input_alphaC_path, 

246 input_alphaW_path=input_alphaW_path, 

247 input_gammaC_path=input_gammaC_path, 

248 input_gammaW_path=input_gammaW_path, 

249 output_csv_path=output_csv_path, 

250 output_jpg_path=output_jpg_path, 

251 properties=properties, 

252 **kwargs, 

253 ).launch() 

254 

255 canonicalag.__doc__ = CanonicalAG.__doc__ 

256 

257 

258def main(): 

259 """Command line execution of this building block. Please check the command line documentation.""" 

260 parser = argparse.ArgumentParser( 

261 description="Calculate Canonical Alpha/Gamma distributions.", 

262 formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999), 

263 ) 

264 parser.add_argument("--config", required=False, help="Configuration file") 

265 

266 required_args = parser.add_argument_group("required arguments") 

267 required_args.add_argument( 

268 "--input_alphaC_path", 

269 required=True, 

270 help="Helical parameter <alphaC> input ser file path. Accepted formats: ser.", 

271 ) 

272 required_args.add_argument( 

273 "--input_alphaW_path", 

274 required=True, 

275 help="Helical parameter <alphaW> input ser file path. Accepted formats: ser.", 

276 ) 

277 required_args.add_argument( 

278 "--input_gammaC_path", 

279 required=True, 

280 help="Helical parameter <gammaC> input ser file path. Accepted formats: ser.", 

281 ) 

282 required_args.add_argument( 

283 "--input_gammaW_path", 

284 required=True, 

285 help="Helical parameter <gammaW> input ser file path. Accepted formats: ser.", 

286 ) 

287 required_args.add_argument( 

288 "--output_csv_path", 

289 required=True, 

290 help="Path to output csv file. Accepted formats: csv.", 

291 ) 

292 required_args.add_argument( 

293 "--output_jpg_path", 

294 required=True, 

295 help="Path to output jpg file. Accepted formats: jpg.", 

296 ) 

297 

298 args = parser.parse_args() 

299 args.config = args.config or "{}" 

300 properties = settings.ConfReader(config=args.config).get_prop_dic() 

301 

302 canonicalag( 

303 input_alphaC_path=args.input_alphaC_path, 

304 input_alphaW_path=args.input_alphaW_path, 

305 input_gammaC_path=args.input_gammaC_path, 

306 input_gammaW_path=args.input_gammaW_path, 

307 output_csv_path=args.output_csv_path, 

308 output_jpg_path=args.output_jpg_path, 

309 properties=properties, 

310 ) 

311 

312 

313if __name__ == "__main__": 

314 main()