Coverage for biobb_dna/backbone/bipopulations.py: 78%

1#!/usr/bin/env python3

2"""Module containing the BIPopulations class and the command line interface."""

4import argparse

5from typing import Optional

7import matplotlib.pyplot as plt

8import pandas as pd

9from biobb_common.configuration import settings

10from biobb_common.generic.biobb_object import BiobbObject

11from biobb_common.tools.file_utils import launchlogger

12from numpy import nan

14from biobb_dna.utils.common import _from_string_to_list

15from biobb_dna.utils.loader import read_series

16from biobb_dna.utils.transform import inverse_complement

19class BIPopulations(BiobbObject):

20 """

21 | biobb_dna BIPopulations

22 | Calculate BI/BII populations from epsilon and zeta parameters.

23 | Calculate BI/BII populations from epsilon and zeta parameters.

25 Args:

26 input_epsilC_path (str): Path to .ser file for helical parameter 'epsilC'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_epsilC.ser>`_. Accepted formats: ser (edam:format_2330).

27 input_epsilW_path (str): Path to .ser file for helical parameter 'epsilW'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_epsilW.ser>`_. Accepted formats: ser (edam:format_2330).

28 input_zetaC_path (str): Path to .ser file for helical parameter 'zetaC'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_zetaC.ser>`_. Accepted formats: ser (edam:format_2330).

29 input_zetaW_path (str): Path to .ser file for helical parameter 'zetaW'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_zetaW.ser>`_. Accepted formats: ser (edam:format_2330).

30 output_csv_path (str): Path to .csv file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/bipop_ref.csv>`_. Accepted formats: csv (edam:format_3752).

31 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/bipop_ref.jpg>`_. Accepted formats: jpg (edam:format_3579).

32 properties (dict):

33 * **sequence** (*str*) - (None) Nucleic acid sequence corresponding to the input .ser file. Length of sequence is expected to be the same as the total number of columns in the .ser file, minus the index column (even if later on a subset of columns is selected with the *seqpos* option).

34 * **seqpos** (*list*) - (None) list of sequence positions (columns indices starting by 0) to analyze. If not specified it will analyse the complete sequence.

35 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.

36 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.

37 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.

39 Examples:

40 This is a use example of how to use the building block from Python::

42 from biobb_dna.backbone.bipopulations import bipopulations

44 prop = {

45 'sequence': 'GCAT',

46 }

47 bipopulations(

48 input_epsilC_path='/path/to/epsilC.ser',

49 input_epsilW_path='/path/to/epsilW.ser',

50 input_zetaC_path='/path/to/zetaC.ser',

51 input_zetaW_path='/path/to/zetaW.ser',

52 output_csv_path='/path/to/table/output.csv',

53 output_jpg_path='/path/to/table/output.jpg',

54 properties=prop)

55 Info:

56 * wrapped_software:

57 * name: In house

58 * license: Apache-2.0

59 * ontology:

60 * name: EDAM

61 * schema: http://edamontology.org/EDAM.owl

63 """

65 def __init__(

66 self,

67 input_epsilC_path,

68 input_epsilW_path,

69 input_zetaC_path,

70 input_zetaW_path,

71 output_csv_path,

72 output_jpg_path,

73 properties=None,

74 **kwargs,

75 ) -> None:

76 properties = properties or {}

78 # Call parent class constructor

79 super().__init__(properties)

80 self.locals_var_dict = locals().copy()

82 # Input/Output files

83 self.io_dict = {

84 "in": {

85 "input_epsilC_path": input_epsilC_path,

86 "input_epsilW_path": input_epsilW_path,

87 "input_zetaC_path": input_zetaC_path,

88 "input_zetaW_path": input_zetaW_path,

89 },

90 "out": {

91 "output_csv_path": output_csv_path,

92 "output_jpg_path": output_jpg_path,

93 },

94 }

96 self.properties = properties

97 self.sequence = properties.get("sequence")

98 # self.seqpos = properties.get("seqpos", None)

99 self.seqpos = [

100 int(elem) for elem in _from_string_to_list(properties.get("seqpos", None))

101 ]

102 print("seqpos::::::::")

103 print(self.seqpos)

104 # Check the properties

105 self.check_properties(properties)

106 self.check_arguments()

107

108 @launchlogger

109 def launch(self) -> int:

110 """Execute the :class:`BIPopulations <backbone.bipopulations.BIPopulations>` object."""

111

112 # Setup Biobb

113 if self.check_restart():

114 return 0

115 self.stage_files()

116

117 # check sequence

118 if self.sequence is None or len(self.sequence) < 2:

119 raise ValueError("sequence is null or too short!")

120

121 # check seqpos

122 if self.seqpos:

123 if (max(self.seqpos) > len(self.sequence) - 2) or (min(self.seqpos) < 1):

124 raise ValueError(

125 f"seqpos values must be between 1 and {len(self.sequence) - 2}"

126 )

127 if not (isinstance(self.seqpos, list) and len(self.seqpos) > 1):

128 raise ValueError("seqpos must be a list of at least two integers")

129 else:

130 self.seqpos = None # type: ignore

131

132 # read input files

133 epsilC = read_series(

134 self.stage_io_dict["in"]["input_epsilC_path"], usecols=self.seqpos

135 )

136 epsilW = read_series(

137 self.stage_io_dict["in"]["input_epsilW_path"], usecols=self.seqpos

138 )

139 zetaC = read_series(

140 self.stage_io_dict["in"]["input_zetaC_path"], usecols=self.seqpos

141 )

142 zetaW = read_series(

143 self.stage_io_dict["in"]["input_zetaW_path"], usecols=self.seqpos

144 )

145

146 # calculate difference between epsil and zeta parameters

147 xlabels = self.get_xlabels(self.sequence, inverse_complement(self.sequence))

148 diff_epsil_zeta = self.get_angles_difference(epsilC, zetaC, epsilW, zetaW)

149

150 # calculate BI population

151 BI = (diff_epsil_zeta < 0).sum(axis=0) * 100 / len(diff_epsil_zeta)

152 BII = 100 - BI

153

154 # save table

155 Bpopulations_df = pd.DataFrame(

156 {"Nucleotide": xlabels, "BI population": BI, "BII population": BII}

157 )

158 Bpopulations_df.to_csv(

159 self.stage_io_dict["out"]["output_csv_path"], index=False

160 )

161

162 # save plot

163 fig, axs = plt.subplots(1, 1, dpi=300, tight_layout=True)

164 axs.bar(range(len(xlabels)), BI, label="BI")

165 axs.bar(range(len(xlabels)), BII, bottom=BI, label="BII")

166 # empty bar to divide both sequences

167 axs.bar([len(BI) // 2], [100], color="white", label=None)

168 axs.legend()

169 axs.set_xticks(range(len(xlabels)))

170 axs.set_xticklabels(xlabels, rotation=90)

171 axs.set_xlabel("Nucleotide Sequence")

172 axs.set_ylabel("BI/BII Population (%)")

173 axs.set_title("Nucleotide parameter: BI/BII Population")

174 fig.savefig(self.stage_io_dict["out"]["output_jpg_path"], format="jpg")

175 plt.close()

176

177 # Copy files to host

178 self.copy_to_host()

179

180 # Remove temporary file(s)

181 # self.tmp_files.extend([self.stage_io_dict.get("unique_dir", "")])

182 self.remove_tmp_files()

183

184 self.check_arguments(output_files_created=True, raise_exception=False)

185

186 return 0

187

188 def get_xlabels(self, strand1, strand2):

189 # get list of tetramers, except first and last two bases

190 labelsW = list(strand1)

191 labelsW[0] = f"{labelsW[0]}5'"

192 labelsW[-1] = f"{labelsW[-1]}3'"

193 labelsW = [f"{i}-{j}" for i, j in zip(labelsW, range(1, len(labelsW) + 1))]

194 labelsC = list(strand2)[::-1]

195 labelsC[0] = f"{labelsC[0]}5'"

196 labelsC[-1] = f"{labelsC[-1]}3'"

197 labelsC = [f"{i}-{j}" for i, j in zip(labelsC, range(len(labelsC), 0, -1))]

198

199 if self.seqpos is not None:

200 labelsC = [labelsC[i] for i in self.seqpos]

201 labelsW = [labelsW[i] for i in self.seqpos]

202 xlabels = labelsW + ["-"] + labelsC

203 return xlabels

204

205 def get_angles_difference(self, epsilC, zetaC, epsilW, zetaW):

206 # concatenate zeta and epsil arrays

207 separator_df = pd.DataFrame({"-": nan}, index=range(len(zetaW)))

208 zeta = pd.concat([zetaW, separator_df, zetaC[zetaC.columns[::-1]]], axis=1)

209 epsil = pd.concat([epsilW, separator_df, epsilC[epsilC.columns[::-1]]], axis=1)

210

211 # difference between epsilon and zeta coordinates

212 diff_epsil_zeta = epsil - zeta

213 return diff_epsil_zeta

214

215

216def bipopulations(

217 input_epsilC_path: str,

218 input_epsilW_path: str,

219 input_zetaC_path: str,

220 input_zetaW_path: str,

221 output_csv_path: str,

222 output_jpg_path: str,

223 properties: Optional[dict] = None,

224 **kwargs,

225) -> int:

226 """Create :class:`BIPopulations <dna.backbone.bipopulations.BIPopulations>` class and

227 execute the: meth: `launch() <dna.backbone.bipopulations.BIPopulations.launch>` method."""

228

229 return BIPopulations(

230 input_epsilC_path=input_epsilC_path,

231 input_epsilW_path=input_epsilW_path,

232 input_zetaC_path=input_zetaC_path,

233 input_zetaW_path=input_zetaW_path,

234 output_csv_path=output_csv_path,

235 output_jpg_path=output_jpg_path,

236 properties=properties,

237 **kwargs,

238 ).launch()

239

240 bipopulations.__doc__ = BIPopulations.__doc__

241

242

243def main():

244 """Command line execution of this building block. Please check the command line documentation."""

245 parser = argparse.ArgumentParser(

246 description="Calculate BI/BII populations.",

247 formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999),

248 )

249 parser.add_argument("--config", required=False, help="Configuration file")

250

251 required_args = parser.add_argument_group("required arguments")

252 required_args.add_argument(

253 "--input_epsilC_path",

254 required=True,

255 help="Helical parameter <epsilC> input ser file path. Accepted formats: ser.",

256 )

257 required_args.add_argument(

258 "--input_epsilW_path",

259 required=True,

260 help="Helical parameter <epsilW> input ser file path. Accepted formats: ser.",

261 )

262 required_args.add_argument(

263 "--input_zetaC_path",

264 required=True,

265 help="Helical parameter <zetaC> input ser file path. Accepted formats: ser.",

266 )

267 required_args.add_argument(

268 "--input_zetaW_path",

269 required=True,

270 help="Helical parameter <zetaW> input ser file path. Accepted formats: ser.",

271 )

272 required_args.add_argument(

273 "--output_csv_path",

274 required=True,

275 help="Path to output csv file. Accepted formats: csv.",

276 )

277 required_args.add_argument(

278 "--output_jpg_path",

279 required=True,

280 help="Path to output jpg file. Accepted formats: jpg.",

281 )

282

283 args = parser.parse_args()

284 args.config = args.config or "{}"

285 properties = settings.ConfReader(config=args.config).get_prop_dic()

286

287 bipopulations(

288 input_epsilC_path=args.input_epsilC_path,

289 input_epsilW_path=args.input_epsilW_path,

290 input_zetaC_path=args.input_zetaC_path,

291 input_zetaW_path=args.input_zetaW_path,

292 output_csv_path=args.output_csv_path,

293 output_jpg_path=args.output_jpg_path,

294 properties=properties,

295 )

296

297

298if __name__ == "__main__":

299 main()