Coverage for biobb_dna/backbone/bipopulations.py: 77%

102 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2025-01-28 10:36 +0000

1#!/usr/bin/env python3 

2"""Module containing the BIPopulations class and the command line interface.""" 

3 

4import argparse 

5from typing import Optional 

6 

7import matplotlib.pyplot as plt 

8import pandas as pd 

9from biobb_common.configuration import settings 

10from biobb_common.generic.biobb_object import BiobbObject 

11from biobb_common.tools.file_utils import launchlogger 

12from numpy import nan 

13 

14from biobb_dna.utils.common import _from_string_to_list 

15from biobb_dna.utils.loader import read_series 

16from biobb_dna.utils.transform import inverse_complement 

17 

18 

19class BIPopulations(BiobbObject): 

20 """ 

21 | biobb_dna BIPopulations 

22 | Calculate BI/BII populations from epsilon and zeta parameters. 

23 | Calculate BI/BII populations from epsilon and zeta parameters. 

24 

25 Args: 

26 input_epsilC_path (str): Path to .ser file for helical parameter 'epsilC'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_epsilC.ser>`_. Accepted formats: ser (edam:format_2330). 

27 input_epsilW_path (str): Path to .ser file for helical parameter 'epsilW'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_epsilW.ser>`_. Accepted formats: ser (edam:format_2330). 

28 input_zetaC_path (str): Path to .ser file for helical parameter 'zetaC'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_zetaC.ser>`_. Accepted formats: ser (edam:format_2330). 

29 input_zetaW_path (str): Path to .ser file for helical parameter 'zetaW'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_zetaW.ser>`_. Accepted formats: ser (edam:format_2330). 

30 output_csv_path (str): Path to .csv file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/bipop_ref.csv>`_. Accepted formats: csv (edam:format_3752). 

31 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/bipop_ref.jpg>`_. Accepted formats: jpg (edam:format_3579). 

32 properties (dict): 

33 * **sequence** (*str*) - (None) Nucleic acid sequence corresponding to the input .ser file. Length of sequence is expected to be the same as the total number of columns in the .ser file, minus the index column (even if later on a subset of columns is selected with the *seqpos* option). 

34 * **seqpos** (*list*) - (None) list of sequence positions (columns indices starting by 0) to analyze. If not specified it will analyse the complete sequence. 

35 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files. 

36 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist. 

37 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory. 

38 

39 Examples: 

40 This is a use example of how to use the building block from Python:: 

41 

42 from biobb_dna.backbone.bipopulations import bipopulations 

43 

44 prop = { 

45 'sequence': 'GCAT', 

46 } 

47 bipopulations( 

48 input_epsilC_path='/path/to/epsilC.ser', 

49 input_epsilW_path='/path/to/epsilW.ser', 

50 input_zetaC_path='/path/to/zetaC.ser', 

51 input_zetaW_path='/path/to/zetaW.ser', 

52 output_csv_path='/path/to/table/output.csv', 

53 output_jpg_path='/path/to/table/output.jpg', 

54 properties=prop) 

55 Info: 

56 * wrapped_software: 

57 * name: In house 

58 * license: Apache-2.0 

59 * ontology: 

60 * name: EDAM 

61 * schema: http://edamontology.org/EDAM.owl 

62 

63 """ 

64 

65 def __init__( 

66 self, 

67 input_epsilC_path, 

68 input_epsilW_path, 

69 input_zetaC_path, 

70 input_zetaW_path, 

71 output_csv_path, 

72 output_jpg_path, 

73 properties=None, 

74 **kwargs, 

75 ) -> None: 

76 properties = properties or {} 

77 

78 # Call parent class constructor 

79 super().__init__(properties) 

80 self.locals_var_dict = locals().copy() 

81 

82 # Input/Output files 

83 self.io_dict = { 

84 "in": { 

85 "input_epsilC_path": input_epsilC_path, 

86 "input_epsilW_path": input_epsilW_path, 

87 "input_zetaC_path": input_zetaC_path, 

88 "input_zetaW_path": input_zetaW_path, 

89 }, 

90 "out": { 

91 "output_csv_path": output_csv_path, 

92 "output_jpg_path": output_jpg_path, 

93 }, 

94 } 

95 

96 self.properties = properties 

97 self.sequence = properties.get("sequence") 

98 # self.seqpos = properties.get("seqpos", None) 

99 self.seqpos = [ 

100 int(elem) for elem in _from_string_to_list(properties.get("seqpos", None)) 

101 ] 

102 print("seqpos::::::::") 

103 print(self.seqpos) 

104 # Check the properties 

105 self.check_properties(properties) 

106 self.check_arguments() 

107 

108 @launchlogger 

109 def launch(self) -> int: 

110 """Execute the :class:`BIPopulations <backbone.bipopulations.BIPopulations>` object.""" 

111 

112 # Setup Biobb 

113 if self.check_restart(): 

114 return 0 

115 self.stage_files() 

116 

117 # check sequence 

118 if self.sequence is None or len(self.sequence) < 2: 

119 raise ValueError("sequence is null or too short!") 

120 

121 # check seqpos 

122 if self.seqpos: 

123 if (max(self.seqpos) > len(self.sequence) - 2) or (min(self.seqpos) < 1): 

124 raise ValueError( 

125 f"seqpos values must be between 1 and {len(self.sequence) - 2}" 

126 ) 

127 if not (isinstance(self.seqpos, list) and len(self.seqpos) > 1): 

128 raise ValueError("seqpos must be a list of at least two integers") 

129 else: 

130 self.seqpos = None # type: ignore 

131 

132 # read input files 

133 epsilC = read_series( 

134 self.stage_io_dict["in"]["input_epsilC_path"], usecols=self.seqpos 

135 ) 

136 epsilW = read_series( 

137 self.stage_io_dict["in"]["input_epsilW_path"], usecols=self.seqpos 

138 ) 

139 zetaC = read_series( 

140 self.stage_io_dict["in"]["input_zetaC_path"], usecols=self.seqpos 

141 ) 

142 zetaW = read_series( 

143 self.stage_io_dict["in"]["input_zetaW_path"], usecols=self.seqpos 

144 ) 

145 

146 # calculate difference between epsil and zeta parameters 

147 xlabels = self.get_xlabels(self.sequence, inverse_complement(self.sequence)) 

148 diff_epsil_zeta = self.get_angles_difference(epsilC, zetaC, epsilW, zetaW) 

149 

150 # calculate BI population 

151 BI = (diff_epsil_zeta < 0).sum(axis=0) * 100 / len(diff_epsil_zeta) 

152 BII = 100 - BI 

153 

154 # save table 

155 Bpopulations_df = pd.DataFrame( 

156 {"Nucleotide": xlabels, "BI population": BI, "BII population": BII} 

157 ) 

158 Bpopulations_df.to_csv( 

159 self.stage_io_dict["out"]["output_csv_path"], index=False 

160 ) 

161 

162 # save plot 

163 fig, axs = plt.subplots(1, 1, dpi=300, tight_layout=True) 

164 axs.bar(range(len(xlabels)), BI, label="BI") 

165 axs.bar(range(len(xlabels)), BII, bottom=BI, label="BII") 

166 # empty bar to divide both sequences 

167 axs.bar([len(BI) // 2], [100], color="white", label=None) 

168 axs.legend() 

169 axs.set_xticks(range(len(xlabels))) 

170 axs.set_xticklabels(xlabels, rotation=90) 

171 axs.set_xlabel("Nucleotide Sequence") 

172 axs.set_ylabel("BI/BII Population (%)") 

173 axs.set_title("Nucleotide parameter: BI/BII Population") 

174 fig.savefig(self.stage_io_dict["out"]["output_jpg_path"], format="jpg") 

175 plt.close() 

176 

177 # Copy files to host 

178 self.copy_to_host() 

179 

180 # Remove temporary file(s) 

181 # self.tmp_files.extend([self.stage_io_dict.get("unique_dir", "")]) 

182 self.remove_tmp_files() 

183 

184 self.check_arguments(output_files_created=True, raise_exception=False) 

185 

186 return 0 

187 

188 def get_xlabels(self, strand1, strand2): 

189 # get list of tetramers, except first and last two bases 

190 labelsW = list(strand1) 

191 labelsW[0] = f"{labelsW[0]}5'" 

192 labelsW[-1] = f"{labelsW[-1]}3'" 

193 labelsW = [f"{i}-{j}" for i, j in zip(labelsW, range(1, len(labelsW) + 1))] 

194 labelsC = list(strand2)[::-1] 

195 labelsC[0] = f"{labelsC[0]}5'" 

196 labelsC[-1] = f"{labelsC[-1]}3'" 

197 labelsC = [f"{i}-{j}" for i, j in zip(labelsC, range(len(labelsC), 0, -1))] 

198 

199 if self.seqpos is not None: 

200 labelsC = [labelsC[i] for i in self.seqpos] 

201 labelsW = [labelsW[i] for i in self.seqpos] 

202 xlabels = labelsW + ["-"] + labelsC 

203 return xlabels 

204 

205 def get_angles_difference(self, epsilC, zetaC, epsilW, zetaW): 

206 # concatenate zeta and epsil arrays 

207 separator_df = pd.DataFrame({"-": nan}, index=range(len(zetaW))) 

208 zeta = pd.concat([zetaW, separator_df, zetaC[zetaC.columns[::-1]]], axis=1) 

209 epsil = pd.concat([epsilW, separator_df, epsilC[epsilC.columns[::-1]]], axis=1) 

210 

211 # difference between epsilon and zeta coordinates 

212 diff_epsil_zeta = epsil - zeta 

213 return diff_epsil_zeta 

214 

215 

216def bipopulations( 

217 input_epsilC_path: str, 

218 input_epsilW_path: str, 

219 input_zetaC_path: str, 

220 input_zetaW_path: str, 

221 output_csv_path: str, 

222 output_jpg_path: str, 

223 properties: Optional[dict] = None, 

224 **kwargs, 

225) -> int: 

226 """Create :class:`BIPopulations <dna.backbone.bipopulations.BIPopulations>` class and 

227 execute the: meth: `launch() <dna.backbone.bipopulations.BIPopulations.launch>` method.""" 

228 

229 return BIPopulations( 

230 input_epsilC_path=input_epsilC_path, 

231 input_epsilW_path=input_epsilW_path, 

232 input_zetaC_path=input_zetaC_path, 

233 input_zetaW_path=input_zetaW_path, 

234 output_csv_path=output_csv_path, 

235 output_jpg_path=output_jpg_path, 

236 properties=properties, 

237 **kwargs, 

238 ).launch() 

239 

240 bipopulations.__doc__ = BIPopulations.__doc__ 

241 

242 

243def main(): 

244 """Command line execution of this building block. Please check the command line documentation.""" 

245 parser = argparse.ArgumentParser( 

246 description="Calculate BI/BII populations.", 

247 formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999), 

248 ) 

249 parser.add_argument("--config", required=False, help="Configuration file") 

250 

251 required_args = parser.add_argument_group("required arguments") 

252 required_args.add_argument( 

253 "--input_epsilC_path", 

254 required=True, 

255 help="Helical parameter <epsilC> input ser file path. Accepted formats: ser.", 

256 ) 

257 required_args.add_argument( 

258 "--input_epsilW_path", 

259 required=True, 

260 help="Helical parameter <epsilW> input ser file path. Accepted formats: ser.", 

261 ) 

262 required_args.add_argument( 

263 "--input_zetaC_path", 

264 required=True, 

265 help="Helical parameter <zetaC> input ser file path. Accepted formats: ser.", 

266 ) 

267 required_args.add_argument( 

268 "--input_zetaW_path", 

269 required=True, 

270 help="Helical parameter <zetaW> input ser file path. Accepted formats: ser.", 

271 ) 

272 required_args.add_argument( 

273 "--output_csv_path", 

274 required=True, 

275 help="Path to output csv file. Accepted formats: csv.", 

276 ) 

277 required_args.add_argument( 

278 "--output_jpg_path", 

279 required=True, 

280 help="Path to output jpg file. Accepted formats: jpg.", 

281 ) 

282 

283 args = parser.parse_args() 

284 args.config = args.config or "{}" 

285 properties = settings.ConfReader(config=args.config).get_prop_dic() 

286 

287 bipopulations( 

288 input_epsilC_path=args.input_epsilC_path, 

289 input_epsilW_path=args.input_epsilW_path, 

290 input_zetaC_path=args.input_zetaC_path, 

291 input_zetaW_path=args.input_zetaW_path, 

292 output_csv_path=args.output_csv_path, 

293 output_jpg_path=args.output_jpg_path, 

294 properties=properties, 

295 ) 

296 

297 

298if __name__ == "__main__": 

299 main()