Coverage for biobb_dna/backbone/bipopulations.py: 79%

105 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-05-07 09:06 +0000

1#!/usr/bin/env python3 

2"""Module containing the BIPopulations class and the command line interface.""" 

3 

4import shutil 

5import argparse 

6 

7import matplotlib.pyplot as plt 

8import pandas as pd 

9from numpy import nan 

10from biobb_dna.utils.loader import read_series 

11from biobb_dna.utils.transform import inverse_complement 

12from biobb_common.generic.biobb_object import BiobbObject 

13from biobb_common.tools.file_utils import launchlogger 

14from biobb_common.tools import file_utils as fu 

15from biobb_common.configuration import settings 

16 

17 

18class BIPopulations(BiobbObject): 

19 """ 

20 | biobb_dna BIPopulations 

21 | Calculate BI/BII populations from epsilon and zeta parameters. 

22 

23 Args: 

24 input_epsilC_path (str): Path to .ser file for helical parameter 'epsilC'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_epsilC.ser>`_. Accepted formats: ser (edam:format_2330). 

25 input_epsilW_path (str): Path to .ser file for helical parameter 'epsilW'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_epsilW.ser>`_. Accepted formats: ser (edam:format_2330). 

26 input_zetaC_path (str): Path to .ser file for helical parameter 'zetaC'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_zetaC.ser>`_. Accepted formats: ser (edam:format_2330). 

27 input_zetaW_path (str): Path to .ser file for helical parameter 'zetaW'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_zetaW.ser>`_. Accepted formats: ser (edam:format_2330). 

28 output_csv_path (str): Path to .csv file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/bipop_ref.csv>`_. Accepted formats: csv (edam:format_3752). 

29 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/bipop_ref.jpg>`_. Accepted formats: jpg (edam:format_3579). 

30 properties (dict): 

31 * **sequence** (*str*) - (None) Nucleic acid sequence corresponding to the input .ser file. Length of sequence is expected to be the same as the total number of columns in the .ser file, minus the index column (even if later on a subset of columns is selected with the *seqpos* option). 

32 * **seqpos** (*list*) - (None) list of sequence positions (columns indices starting by 0) to analyze. If not specified it will analyse the complete sequence. 

33 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files. 

34 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist. 

35 

36 Examples: 

37 This is a use example of how to use the building block from Python:: 

38 

39 from biobb_dna.backbone.bipopulations import bipopulations 

40 

41 prop = { 

42 'sequence': 'GCAT', 

43 } 

44 bipopulations( 

45 input_epsilC_path='/path/to/epsilC.ser', 

46 input_epsilW_path='/path/to/epsilW.ser', 

47 input_zetaC_path='/path/to/zetaC.ser', 

48 input_zetaW_path='/path/to/zetaW.ser', 

49 output_csv_path='/path/to/table/output.csv', 

50 output_jpg_path='/path/to/table/output.jpg', 

51 properties=prop) 

52 Info: 

53 * wrapped_software: 

54 * name: In house 

55 * license: Apache-2.0 

56 * ontology: 

57 * name: EDAM 

58 * schema: http://edamontology.org/EDAM.owl 

59 

60 """ 

61 

62 def __init__(self, input_epsilC_path, input_epsilW_path, 

63 input_zetaC_path, input_zetaW_path, 

64 output_csv_path, output_jpg_path, 

65 properties=None, **kwargs) -> None: 

66 properties = properties or {} 

67 

68 # Call parent class constructor 

69 super().__init__(properties) 

70 self.locals_var_dict = locals().copy() 

71 

72 # Input/Output files 

73 self.io_dict = { 

74 'in': { 

75 'input_epsilC_path': input_epsilC_path, 

76 'input_epsilW_path': input_epsilW_path, 

77 'input_zetaC_path': input_zetaC_path, 

78 'input_zetaW_path': input_zetaW_path 

79 }, 

80 'out': { 

81 'output_csv_path': output_csv_path, 

82 'output_jpg_path': output_jpg_path 

83 } 

84 } 

85 

86 self.properties = properties 

87 self.sequence = properties.get("sequence") 

88 self.seqpos = properties.get("seqpos", None) 

89 

90 # Check the properties 

91 self.check_properties(properties) 

92 self.check_arguments() 

93 

94 @launchlogger 

95 def launch(self) -> int: 

96 """Execute the :class:`BIPopulations <backbone.bipopulations.BIPopulations>` object.""" 

97 

98 # Setup Biobb 

99 if self.check_restart(): 

100 return 0 

101 self.stage_files() 

102 

103 # check sequence 

104 if self.sequence is None or len(self.sequence) < 2: 

105 raise ValueError("sequence is null or too short!") 

106 

107 # check seqpos 

108 if self.seqpos is not None: 

109 if (max(self.seqpos) > len(self.sequence) - 2) or (min(self.seqpos) < 1): 

110 raise ValueError( 

111 f"seqpos values must be between 1 and {len(self.sequence) - 2}") 

112 if not (isinstance(self.seqpos, list) and len(self.seqpos) > 1): 

113 raise ValueError( 

114 "seqpos must be a list of at least two integers") 

115 

116 # Creating temporary folder 

117 self.tmp_folder = fu.create_unique_dir(prefix="backbone_") 

118 fu.log('Creating %s temporary folder' % self.tmp_folder, self.out_log) 

119 

120 # Copy input_file_path1 to temporary folder 

121 shutil.copy(self.io_dict['in']['input_epsilC_path'], self.tmp_folder) 

122 shutil.copy(self.io_dict['in']['input_epsilW_path'], self.tmp_folder) 

123 shutil.copy(self.io_dict['in']['input_zetaC_path'], self.tmp_folder) 

124 shutil.copy(self.io_dict['in']['input_zetaW_path'], self.tmp_folder) 

125 

126 # read input files 

127 epsilC = read_series( 

128 self.io_dict['in']['input_epsilC_path'], 

129 usecols=self.seqpos) 

130 epsilW = read_series( 

131 self.io_dict['in']['input_epsilW_path'], 

132 usecols=self.seqpos) 

133 zetaC = read_series( 

134 self.io_dict['in']['input_zetaC_path'], 

135 usecols=self.seqpos) 

136 zetaW = read_series( 

137 self.io_dict['in']['input_zetaW_path'], 

138 usecols=self.seqpos) 

139 

140 # calculate difference between epsil and zeta parameters 

141 xlabels = self.get_xlabels( 

142 self.sequence, 

143 inverse_complement(self.sequence)) 

144 diff_epsil_zeta = self.get_angles_difference( 

145 epsilC, 

146 zetaC, 

147 epsilW, 

148 zetaW) 

149 

150 # calculate BI population 

151 BI = (diff_epsil_zeta < 0).sum(axis=0) * 100 / len(diff_epsil_zeta) 

152 BII = 100 - BI 

153 

154 # save table 

155 Bpopulations_df = pd.DataFrame({ 

156 "Nucleotide": xlabels, 

157 "BI population": BI, 

158 "BII population": BII}) 

159 Bpopulations_df.to_csv( 

160 self.io_dict['out']['output_csv_path'], 

161 index=False) 

162 

163 # save plot 

164 fig, axs = plt.subplots(1, 1, dpi=300, tight_layout=True) 

165 axs.bar( 

166 range(len(xlabels)), 

167 BI, 

168 label="BI") 

169 axs.bar( 

170 range(len(xlabels)), 

171 BII, 

172 bottom=BI, 

173 label="BII") 

174 # empty bar to divide both sequences 

175 axs.bar( 

176 [len(BI)//2], 

177 [100], 

178 color='white', 

179 label=None) 

180 axs.legend() 

181 axs.set_xticks(range(len(xlabels))) 

182 axs.set_xticklabels(xlabels, rotation=90) 

183 axs.set_xlabel("Nucleotide Sequence") 

184 axs.set_ylabel("BI/BII Population (%)") 

185 axs.set_title("Nucleotide parameter: BI/BII Population") 

186 fig.savefig( 

187 self.io_dict['out']['output_jpg_path'], 

188 format="jpg") 

189 plt.close() 

190 

191 # Remove temporary file(s) 

192 self.tmp_files.extend([ 

193 self.stage_io_dict.get("unique_dir"), 

194 self.tmp_folder 

195 ]) 

196 self.remove_tmp_files() 

197 

198 self.check_arguments(output_files_created=True, raise_exception=False) 

199 

200 return 0 

201 

202 def get_xlabels(self, strand1, strand2): 

203 # get list of tetramers, except first and last two bases 

204 labelsW = list(strand1) 

205 labelsW[0] = f"{labelsW[0]}5\'" 

206 labelsW[-1] = f"{labelsW[-1]}3\'" 

207 labelsW = [ 

208 f"{i}-{j}" for i, j in zip(labelsW, range(1, len(labelsW)+1))] 

209 labelsC = list(strand2)[::-1] 

210 labelsC[0] = f"{labelsC[0]}5\'" 

211 labelsC[-1] = f"{labelsC[-1]}3\'" 

212 labelsC = [ 

213 f"{i}-{j}" for i, j in zip(labelsC, range(len(labelsC), 0, -1))] 

214 

215 if self.seqpos is not None: 

216 labelsC = [labelsC[i] for i in self.seqpos] 

217 labelsW = [labelsW[i] for i in self.seqpos] 

218 xlabels = labelsW + ['-'] + labelsC 

219 return xlabels 

220 

221 def get_angles_difference(self, epsilC, zetaC, epsilW, zetaW): 

222 # concatenate zeta and epsil arrays 

223 separator_df = pd.DataFrame({"-": nan}, index=range(len(zetaW))) 

224 zeta = pd.concat([ 

225 zetaW, 

226 separator_df, 

227 zetaC[zetaC.columns[::-1]]], 

228 axis=1) 

229 epsil = pd.concat([ 

230 epsilW, 

231 separator_df, 

232 epsilC[epsilC.columns[::-1]]], 

233 axis=1) 

234 

235 # difference between epsilon and zeta coordinates 

236 diff_epsil_zeta = epsil - zeta 

237 return diff_epsil_zeta 

238 

239 

240def bipopulations( 

241 input_epsilC_path: str, input_epsilW_path: str, 

242 input_zetaC_path: str, input_zetaW_path: str, 

243 output_csv_path: str, output_jpg_path: str, 

244 properties: dict = None, **kwargs) -> int: 

245 """Create :class:`BIPopulations <dna.backbone.bipopulations.BIPopulations>` class and 

246 execute the: meth: `launch() <dna.backbone.bipopulations.BIPopulations.launch>` method. """ 

247 

248 return BIPopulations( 

249 input_epsilC_path=input_epsilC_path, 

250 input_epsilW_path=input_epsilW_path, 

251 input_zetaC_path=input_zetaC_path, 

252 input_zetaW_path=input_zetaW_path, 

253 output_csv_path=output_csv_path, 

254 output_jpg_path=output_jpg_path, 

255 properties=properties, **kwargs).launch() 

256 

257 

258def main(): 

259 """Command line execution of this building block. Please check the command line documentation.""" 

260 parser = argparse.ArgumentParser(description='Calculate BI/BII populations.', 

261 formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999)) 

262 parser.add_argument('--config', required=False, help='Configuration file') 

263 

264 required_args = parser.add_argument_group('required arguments') 

265 required_args.add_argument('--input_epsilC_path', required=True, 

266 help='Helical parameter <epsilC> input ser file path. Accepted formats: ser.') 

267 required_args.add_argument('--input_epsilW_path', required=True, 

268 help='Helical parameter <epsilW> input ser file path. Accepted formats: ser.') 

269 required_args.add_argument('--input_zetaC_path', required=True, 

270 help='Helical parameter <zetaC> input ser file path. Accepted formats: ser.') 

271 required_args.add_argument('--input_zetaW_path', required=True, 

272 help='Helical parameter <zetaW> input ser file path. Accepted formats: ser.') 

273 required_args.add_argument('--output_csv_path', required=True, 

274 help='Path to output csv file. Accepted formats: csv.') 

275 required_args.add_argument('--output_jpg_path', required=True, 

276 help='Path to output jpg file. Accepted formats: jpg.') 

277 

278 args = parser.parse_args() 

279 args.config = args.config or "{}" 

280 properties = settings.ConfReader(config=args.config).get_prop_dic() 

281 

282 bipopulations( 

283 input_epsilC_path=args.input_epsilC_path, 

284 input_epsilW_path=args.input_epsilW_path, 

285 input_zetaC_path=args.input_zetaC_path, 

286 input_zetaW_path=args.input_zetaW_path, 

287 output_csv_path=args.output_csv_path, 

288 output_jpg_path=args.output_jpg_path, 

289 properties=properties) 

290 

291 

292if __name__ == '__main__': 

293 main()