Coverage for biobb_dna/backbone/canonicalag.py: 81%

115 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-05-07 09:06 +0000

1#!/usr/bin/env python3 

2"""Module containing the CanonicalAG class and the command line interface.""" 

3 

4import shutil 

5import argparse 

6 

7import matplotlib.pyplot as plt 

8import pandas as pd 

9import numpy as np 

10from biobb_dna.utils.loader import read_series 

11from biobb_dna.utils.transform import inverse_complement 

12from biobb_common.generic.biobb_object import BiobbObject 

13from biobb_common.tools.file_utils import launchlogger 

14from biobb_common.tools import file_utils as fu 

15from biobb_common.configuration import settings 

16 

17 

18class CanonicalAG(BiobbObject): 

19 """ 

20 | biobb_dna CanonicalAG 

21 | Calculate Canonical Alpha/Gamma populations from alpha and gamma parameters. 

22 

23 Args: 

24 input_alphaC_path (str): Path to .ser file for helical parameter 'alphaC'. File type: input. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_alphaC.ser>`_. Accepted formats: ser (edam:format_2330). 

25 input_alphaW_path (str): Path to .ser file for helical parameter 'alphaW'. File type: input. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_alphaW.ser>`_. Accepted formats: ser (edam:format_2330). 

26 input_gammaC_path (str): Path to .ser file for helical parameter 'gammaC'. File type: input. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_gammaC.ser>`_. Accepted formats: ser (edam:format_2330). 

27 input_gammaW_path (str): Path to .ser file for helical parameter 'gammaW'. File type: input. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_gammaW.ser>`_. Accepted formats: ser (edam:format_2330). 

28 output_csv_path (str): Path to .csv file where output is saved. File type: output. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/canonag_ref.csv>`_. Accepted formats: csv (edam:format_3752). 

29 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/canonag_ref.jpg>`_. Accepted formats: jpg (edam:format_3579). 

30 properties (dict): 

31 * **sequence** (*str*) - (None) Nucleic acid sequence corresponding to the input .ser file. Length of sequence is expected to be the same as the total number of columns in the .ser file, minus the index column (even if later on a subset of columns is selected with the *seqpos* option). 

32 * **seqpos** (*list*) - (None) list of sequence positions (columns indices starting by 0) to analyze. If not specified it will analyse the complete sequence. 

33 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files. 

34 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist. 

35 

36 Examples: 

37 This is a use example of how to use the building block from Python:: 

38 

39 from biobb_dna.backbone.canonicalag import canonicalag 

40 

41 prop = { 

42 'helpar_name': 'twist', 

43 'seqpos': [1,2], 

44 'sequence': 'GCAT', 

45 } 

46 canonicalag( 

47 input_alphaC_path='/path/to/alphaC.ser', 

48 input_alphaW_path='/path/to/alphaW.ser', 

49 input_gammaC_path='/path/to/gammaC.ser', 

50 input_gammaW_path='/path/to/gammaW.ser', 

51 output_csv_path='/path/to/table/output.csv', 

52 output_jpg_path='/path/to/table/output.jpg', 

53 properties=prop) 

54 Info: 

55 * wrapped_software: 

56 * name: In house 

57 * license: Apache-2.0 

58 * ontology: 

59 * name: EDAM 

60 * schema: http://edamontology.org/EDAM.owl 

61 

62 """ 

63 

64 def __init__(self, input_alphaC_path, input_alphaW_path, 

65 input_gammaC_path, input_gammaW_path, 

66 output_csv_path, output_jpg_path, 

67 properties=None, **kwargs) -> None: 

68 properties = properties or {} 

69 

70 # Call parent class constructor 

71 super().__init__(properties) 

72 self.locals_var_dict = locals().copy() 

73 

74 # Input/Output files 

75 self.io_dict = { 

76 'in': { 

77 'input_alphaC_path': input_alphaC_path, 

78 'input_alphaW_path': input_alphaW_path, 

79 'input_gammaC_path': input_gammaC_path, 

80 'input_gammaW_path': input_gammaW_path 

81 }, 

82 'out': { 

83 'output_csv_path': output_csv_path, 

84 'output_jpg_path': output_jpg_path 

85 } 

86 } 

87 

88 self.properties = properties 

89 self.sequence = properties.get("sequence") 

90 self.stride = properties.get( 

91 "stride", 1000) 

92 self.seqpos = properties.get( 

93 "seqpos", None) 

94 

95 # Check the properties 

96 self.check_properties(properties) 

97 self.check_arguments() 

98 

99 @launchlogger 

100 def launch(self) -> int: 

101 """Execute the :class:`CanonicalAG <backbone.canonicalag.CanonicalAG>` object.""" 

102 

103 # Setup Biobb 

104 if self.check_restart(): 

105 return 0 

106 self.stage_files() 

107 

108 # check sequence 

109 if self.sequence is None or len(self.sequence) < 2: 

110 raise ValueError("sequence is null or too short!") 

111 

112 # check seqpos 

113 if self.seqpos is not None: 

114 if (max(self.seqpos) > len(self.sequence) - 2) or (min(self.seqpos) < 1): 

115 raise ValueError( 

116 f"seqpos values must be between 1 and {len(self.sequence) - 2}") 

117 if not (isinstance(self.seqpos, list) and len(self.seqpos) > 1): 

118 raise ValueError( 

119 "seqpos must be a list of at least two integers") 

120 

121 # Creating temporary folder 

122 self.tmp_folder = fu.create_unique_dir(prefix="backbone_") 

123 fu.log('Creating %s temporary folder' % self.tmp_folder, self.out_log) 

124 

125 # Copy input_file_path1 to temporary folder 

126 shutil.copy(self.io_dict['in']['input_alphaC_path'], self.tmp_folder) 

127 shutil.copy(self.io_dict['in']['input_alphaW_path'], self.tmp_folder) 

128 shutil.copy(self.io_dict['in']['input_gammaC_path'], self.tmp_folder) 

129 shutil.copy(self.io_dict['in']['input_gammaW_path'], self.tmp_folder) 

130 

131 # read input files 

132 alphaC = read_series( 

133 self.io_dict['in']['input_alphaC_path'], usecols=self.seqpos) 

134 alphaW = read_series( 

135 self.io_dict['in']['input_alphaW_path'], usecols=self.seqpos) 

136 gammaC = read_series( 

137 self.io_dict['in']['input_gammaC_path'], usecols=self.seqpos) 

138 gammaW = read_series( 

139 self.io_dict['in']['input_gammaW_path'], usecols=self.seqpos) 

140 

141 # fix angle range so its not negative 

142 alphaC = self.fix_angles(alphaC) 

143 alphaW = self.fix_angles(alphaW) 

144 gammaC = self.fix_angles(gammaC) 

145 gammaW = self.fix_angles(gammaW) 

146 

147 # calculate difference between epsil and zeta parameters 

148 xlabels = self.get_xlabels( 

149 self.sequence, 

150 inverse_complement(self.sequence)) 

151 canonical_populations = self.check_alpha_gamma( 

152 alphaC, 

153 gammaC, 

154 alphaW, 

155 gammaW) 

156 

157 # save table 

158 canonical_populations.name = "Canonical alpha/gamma" 

159 ag_populations_df = pd.DataFrame({ 

160 "Nucleotide": xlabels, 

161 "Canonical alpha/gamma": canonical_populations}) 

162 ag_populations_df.to_csv( 

163 self.io_dict['out']['output_csv_path'], 

164 index=False) 

165 

166 # save plot 

167 fig, axs = plt.subplots(1, 1, dpi=300, tight_layout=True) 

168 axs.bar( 

169 range(len(xlabels)), 

170 canonical_populations, 

171 label="canonical alpha/gamma") 

172 axs.bar( 

173 range(len(xlabels)), 

174 100 - canonical_populations, 

175 bottom=canonical_populations, 

176 label=None) 

177 # empty bar to divide both sequences 

178 axs.bar( 

179 [len(alphaC.columns)], 

180 [100], 

181 color='white', 

182 label=None) 

183 axs.legend() 

184 axs.set_xticks(range(len(xlabels))) 

185 axs.set_xticklabels(xlabels, rotation=90) 

186 axs.set_xlabel("Nucleotide Sequence") 

187 axs.set_ylabel("Canonical Alpha-Gamma (%)") 

188 axs.set_title("Nucleotide parameter: Canonical Alpha-Gamma") 

189 fig.savefig( 

190 self.io_dict['out']['output_jpg_path'], 

191 format="jpg") 

192 plt.close() 

193 

194 # Remove temporary file(s) 

195 self.tmp_files.extend([ 

196 self.stage_io_dict.get("unique_dir"), 

197 self.tmp_folder 

198 ]) 

199 self.remove_tmp_files() 

200 

201 return 0 

202 

203 def get_xlabels(self, strand1, strand2): 

204 # get list of tetramers, except first and last two bases 

205 labelsW = list(strand1) 

206 labelsW[0] = f"{labelsW[0]}5\'" 

207 labelsW[-1] = f"{labelsW[-1]}3\'" 

208 labelsW = [ 

209 f"{i}-{j}" for i, j in zip(labelsW, range(1, len(labelsW)+1))] 

210 labelsC = list(strand2)[::-1] 

211 labelsC[0] = f"{labelsC[0]}5\'" 

212 labelsC[-1] = f"{labelsC[-1]}3\'" 

213 labelsC = [ 

214 f"{i}-{j}" for i, j in zip(labelsC, range(len(labelsC), 0, -1))] 

215 

216 if self.seqpos is not None: 

217 labelsC = [labelsC[i] for i in self.seqpos] 

218 labelsW = [labelsW[i] for i in self.seqpos] 

219 xlabels = labelsW + ['-'] + labelsC 

220 return xlabels 

221 

222 def check_alpha_gamma(self, alphaC, gammaC, alphaW, gammaW): 

223 separator_df = pd.DataFrame({"-": np.nan}, index=range(len(gammaW))) 

224 gamma = pd.concat([ 

225 gammaW, 

226 separator_df, 

227 gammaC[gammaC.columns[::-1]]], 

228 axis=1) 

229 alpha = pd.concat([ 

230 alphaW, 

231 separator_df, 

232 alphaC[alphaC.columns[::-1]]], 

233 axis=1) 

234 

235 alpha_filter = np.logical_and(alpha > 240, alpha < 360) 

236 gamma_filter = np.logical_and(gamma > 0, gamma < 120) 

237 canonical_alpha_gamma = np.logical_and( 

238 alpha_filter, gamma_filter).mean() * 100 

239 

240 return canonical_alpha_gamma 

241 

242 def fix_angles(self, dataset): 

243 values = np.where(dataset < 0, dataset + 360, dataset) 

244 values = np.where(values > 360, values - 360, values) 

245 dataset = pd.DataFrame(values) 

246 return dataset 

247 

248 

249def canonicalag( 

250 input_alphaC_path: str, input_alphaW_path: str, 

251 input_gammaC_path: str, input_gammaW_path: str, 

252 output_csv_path: str, output_jpg_path: str, 

253 properties: dict = None, **kwargs) -> int: 

254 """Create :class:`CanonicalAG <dna.backbone.canonicalag.CanonicalAG>` class and 

255 execute the: meth: `launch() <dna.backbone.canonicalag.CanonicalAG.launch>` method. """ 

256 

257 return CanonicalAG( 

258 input_alphaC_path=input_alphaC_path, 

259 input_alphaW_path=input_alphaW_path, 

260 input_gammaC_path=input_gammaC_path, 

261 input_gammaW_path=input_gammaW_path, 

262 output_csv_path=output_csv_path, 

263 output_jpg_path=output_jpg_path, 

264 properties=properties, **kwargs).launch() 

265 

266 

267def main(): 

268 """Command line execution of this building block. Please check the command line documentation.""" 

269 parser = argparse.ArgumentParser(description='Calculate Canonical Alpha/Gamma distributions.', 

270 formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999)) 

271 parser.add_argument('--config', required=False, help='Configuration file') 

272 

273 required_args = parser.add_argument_group('required arguments') 

274 required_args.add_argument('--input_alphaC_path', required=True, 

275 help='Helical parameter <alphaC> input ser file path. Accepted formats: ser.') 

276 required_args.add_argument('--input_alphaW_path', required=True, 

277 help='Helical parameter <alphaW> input ser file path. Accepted formats: ser.') 

278 required_args.add_argument('--input_gammaC_path', required=True, 

279 help='Helical parameter <gammaC> input ser file path. Accepted formats: ser.') 

280 required_args.add_argument('--input_gammaW_path', required=True, 

281 help='Helical parameter <gammaW> input ser file path. Accepted formats: ser.') 

282 required_args.add_argument('--output_csv_path', required=True, 

283 help='Path to output csv file. Accepted formats: csv.') 

284 required_args.add_argument('--output_jpg_path', required=True, 

285 help='Path to output jpg file. Accepted formats: jpg.') 

286 

287 args = parser.parse_args() 

288 args.config = args.config or "{}" 

289 properties = settings.ConfReader(config=args.config).get_prop_dic() 

290 

291 canonicalag( 

292 input_alphaC_path=args.input_alphaC_path, 

293 input_alphaW_path=args.input_alphaW_path, 

294 input_gammaC_path=args.input_gammaC_path, 

295 input_gammaW_path=args.input_gammaW_path, 

296 output_csv_path=args.output_csv_path, 

297 output_jpg_path=args.output_jpg_path, 

298 properties=properties) 

299 

300 

301if __name__ == '__main__': 

302 main()