Coverage for biobb_dna/backbone/puckering.py: 80%

105 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2025-01-28 10:36 +0000

1#!/usr/bin/env python3 

2"""Module containing the Puckering class and the command line interface.""" 

3 

4import argparse 

5from typing import Optional 

6 

7import matplotlib.pyplot as plt 

8import numpy as np 

9import pandas as pd 

10from biobb_common.configuration import settings 

11from biobb_common.generic.biobb_object import BiobbObject 

12from biobb_common.tools.file_utils import launchlogger 

13 

14from biobb_dna.utils.common import _from_string_to_list 

15from biobb_dna.utils.loader import read_series 

16from biobb_dna.utils.transform import inverse_complement 

17 

18 

19class Puckering(BiobbObject): 

20 """ 

21 | biobb_dna Puckering 

22 | Calculate Puckering from phase parameters. 

23 | Calculate North/East/West/South distribution of sugar puckering backbone torsions. 

24 

25 Args: 

26 input_phaseC_path (str): Path to .ser file for helical parameter 'phaseC'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_phaseC.ser>`_. Accepted formats: ser (edam:format_2330). 

27 input_phaseW_path (str): Path to .ser file for helical parameter 'phaseW'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_phaseW.ser>`_. Accepted formats: ser (edam:format_2330). 

28 output_csv_path (str): Path to .csv file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/puckering_ref.csv>`_. Accepted formats: csv (edam:format_3752). 

29 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/puckering_ref.jpg>`_. Accepted formats: jpg (edam:format_3579). 

30 properties (dict): 

31 * **sequence** (*str*) - (None) Nucleic acid sequence corresponding to the input .ser file. Length of sequence is expected to be the same as the total number of columns in the .ser file, minus the index column (even if later on a subset of columns is selected with the *seqpos* option). 

32 * **stride** (*int*) - (1000) granularity of the number of snapshots for plotting time series. 

33 * **seqpos** (*list*) - (None) list of sequence positions (columns indices starting by 0) to analyze. If not specified it will analyse the complete sequence. 

34 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files. 

35 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist. 

36 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory. 

37 

38 Examples: 

39 This is a use example of how to use the building block from Python:: 

40 

41 from biobb_dna.backbone.puckering import puckering 

42 

43 prop = { 

44 'sequence': 'GCAT', 

45 } 

46 puckering( 

47 input_phaseC_path='/path/to/phaseC.ser', 

48 input_phaseW_path='/path/to/phaseW.ser', 

49 output_csv_path='/path/to/table/output.csv', 

50 output_jpg_path='/path/to/table/output.jpg', 

51 properties=prop) 

52 Info: 

53 * wrapped_software: 

54 * name: In house 

55 * license: Apache-2.0 

56 * ontology: 

57 * name: EDAM 

58 * schema: http://edamontology.org/EDAM.owl 

59 

60 """ 

61 

62 def __init__( 

63 self, 

64 input_phaseC_path, 

65 input_phaseW_path, 

66 output_csv_path, 

67 output_jpg_path, 

68 properties=None, 

69 **kwargs, 

70 ) -> None: 

71 properties = properties or {} 

72 

73 # Call parent class constructor 

74 super().__init__(properties) 

75 self.locals_var_dict = locals().copy() 

76 

77 # Input/Output files 

78 self.io_dict = { 

79 "in": { 

80 "input_phaseC_path": input_phaseC_path, 

81 "input_phaseW_path": input_phaseW_path, 

82 }, 

83 "out": { 

84 "output_csv_path": output_csv_path, 

85 "output_jpg_path": output_jpg_path, 

86 }, 

87 } 

88 

89 self.properties = properties 

90 self.sequence = properties.get("sequence") 

91 self.stride = properties.get("stride", 1000) 

92 self.seqpos = [ 

93 int(elem) for elem in _from_string_to_list(properties.get("seqpos", None)) 

94 ] 

95 

96 # Check the properties 

97 self.check_properties(properties) 

98 self.check_arguments() 

99 

100 @launchlogger 

101 def launch(self) -> int: 

102 """Execute the :class:`Puckering <backbone.puckering.Puckering>` object.""" 

103 

104 # Setup Biobb 

105 if self.check_restart(): 

106 return 0 

107 self.stage_files() 

108 

109 # check sequence 

110 if self.sequence is None or len(self.sequence) < 2: 

111 raise ValueError("sequence is null or too short!") 

112 

113 # check seqpos 

114 if self.seqpos: 

115 if (max(self.seqpos) > len(self.sequence) - 2) or (min(self.seqpos) < 1): 

116 raise ValueError( 

117 f"seqpos values must be between 1 and {len(self.sequence) - 2}" 

118 ) 

119 if not (isinstance(self.seqpos, list) and len(self.seqpos) > 1): 

120 raise ValueError("seqpos must be a list of at least two integers") 

121 else: 

122 self.seqpos = None # type: ignore 

123 

124 # read input files 

125 phaseC = read_series( 

126 self.stage_io_dict["in"]["input_phaseC_path"], usecols=self.seqpos 

127 ) 

128 phaseW = read_series( 

129 self.stage_io_dict["in"]["input_phaseW_path"], usecols=self.seqpos 

130 ) 

131 

132 # fix angle range so its not negative 

133 phaseC = self.fix_angles(phaseC) 

134 phaseW = self.fix_angles(phaseW) 

135 

136 # calculate difference between epsil and zeta parameters 

137 xlabels = self.get_xlabels(self.sequence, inverse_complement(self.sequence)) 

138 Npop, Epop, Wpop, Spop = self.check_puckering(phaseC, phaseW) 

139 

140 # save plot 

141 fig, axs = plt.subplots(1, 1, dpi=300, tight_layout=True) 

142 axs.bar(range(len(xlabels)), Npop, label="North") 

143 axs.bar(range(len(xlabels)), Epop, bottom=Npop, label="East") 

144 axs.bar(range(len(xlabels)), Spop, bottom=Npop + Epop, label="South") 

145 axs.bar(range(len(xlabels)), Wpop, bottom=Npop + Epop + Spop, label="West") 

146 # empty bar to divide both sequences 

147 axs.bar([len(phaseC.columns)], [100], color="white", label=None) 

148 axs.legend() 

149 axs.set_xticks(range(len(xlabels))) 

150 axs.set_xticklabels(xlabels, rotation=90) 

151 axs.set_xlabel("Nucleotide Sequence") 

152 axs.set_ylabel("Puckering (%)") 

153 axs.set_title("Nucleotide parameter: Puckering") 

154 fig.savefig(self.stage_io_dict["out"]["output_jpg_path"], format="jpg") 

155 

156 # save table 

157 populations = pd.DataFrame( 

158 { 

159 "Nucleotide": xlabels, 

160 "North": Npop, 

161 "East": Epop, 

162 "West": Wpop, 

163 "South": Spop, 

164 } 

165 ) 

166 populations.to_csv(self.stage_io_dict["out"]["output_csv_path"], index=False) 

167 

168 plt.close() 

169 

170 # Copy files to host 

171 self.copy_to_host() 

172 

173 # Remove temporary file(s) 

174 # self.tmp_files.extend([self.stage_io_dict.get("unique_dir", "")]) 

175 self.remove_tmp_files() 

176 

177 self.check_arguments(output_files_created=True, raise_exception=False) 

178 

179 return 0 

180 

181 def get_xlabels(self, strand1, strand2): 

182 # get list of tetramers, except first and last two bases 

183 labelsW = list(strand1) 

184 labelsW[0] = f"{labelsW[0]}5'" 

185 labelsW[-1] = f"{labelsW[-1]}3'" 

186 labelsW = [f"{i}-{j}" for i, j in zip(labelsW, range(1, len(labelsW) + 1))] 

187 labelsC = list(strand2)[::-1] 

188 labelsC[0] = f"{labelsC[0]}5'" 

189 labelsC[-1] = f"{labelsC[-1]}3'" 

190 labelsC = [f"{i}-{j}" for i, j in zip(labelsC, range(len(labelsC), 0, -1))] 

191 

192 if self.seqpos: 

193 labelsC = [labelsC[i] for i in self.seqpos] 

194 labelsW = [labelsW[i] for i in self.seqpos] 

195 xlabels = labelsW + ["-"] + labelsC 

196 return xlabels 

197 

198 def check_puckering(self, phaseC, phaseW): 

199 separator_df = pd.DataFrame({"-": np.nan}, index=range(1, len(phaseC))) 

200 phase = pd.concat([phaseW, separator_df, phaseC[phaseC.columns[::-1]]], axis=1) 

201 # phase.columns = columns 

202 

203 Npop = np.logical_or(phase > 315, phase < 45).mean() * 100 

204 Epop = np.logical_and(phase > 45, phase < 135).mean() * 100 

205 Wpop = np.logical_and(phase > 225, phase < 315).mean() * 100 

206 Spop = np.logical_and(phase > 135, phase < 225).mean() * 100 

207 return Npop, Epop, Wpop, Spop 

208 

209 def fix_angles(self, dataset): 

210 values = np.where(dataset < 0, dataset + 360, dataset) 

211 # values = np.where(values > 360, values - 360, values) 

212 dataset = pd.DataFrame(values) 

213 return dataset 

214 

215 

216def puckering( 

217 input_phaseC_path: str, 

218 input_phaseW_path: str, 

219 output_csv_path: str, 

220 output_jpg_path: str, 

221 properties: Optional[dict] = None, 

222 **kwargs, 

223) -> int: 

224 """Create :class:`Puckering <dna.backbone.puckering.Puckering>` class and 

225 execute the: meth: `launch() <dna.backbone.puckering.Puckering.launch>` method.""" 

226 

227 return Puckering( 

228 input_phaseC_path=input_phaseC_path, 

229 input_phaseW_path=input_phaseW_path, 

230 output_csv_path=output_csv_path, 

231 output_jpg_path=output_jpg_path, 

232 properties=properties, 

233 **kwargs, 

234 ).launch() 

235 

236 puckering.__doc__ = Puckering.__doc__ 

237 

238 

239def main(): 

240 """Command line execution of this building block. Please check the command line documentation.""" 

241 parser = argparse.ArgumentParser( 

242 description="Calculate North/East/West/South distribution of sugar puckering backbone torsions.", 

243 formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999), 

244 ) 

245 parser.add_argument("--config", required=False, help="Configuration file") 

246 

247 required_args = parser.add_argument_group("required arguments") 

248 required_args.add_argument( 

249 "--input_phaseC_path", 

250 required=True, 

251 help="Helical parameter <alphaC> input ser file path. Accepted formats: ser.", 

252 ) 

253 required_args.add_argument( 

254 "--input_phaseW_path", 

255 required=True, 

256 help="Helical parameter <alphaW> input ser file path. Accepted formats: ser.", 

257 ) 

258 required_args.add_argument( 

259 "--output_csv_path", 

260 required=True, 

261 help="Path to output csv file. Accepted formats: csv.", 

262 ) 

263 required_args.add_argument( 

264 "--output_jpg_path", 

265 required=True, 

266 help="Path to output jpg file. Accepted formats: jpg.", 

267 ) 

268 

269 args = parser.parse_args() 

270 args.config = args.config or "{}" 

271 properties = settings.ConfReader(config=args.config).get_prop_dic() 

272 

273 puckering( 

274 input_phaseC_path=args.input_phaseC_path, 

275 input_phaseW_path=args.input_phaseW_path, 

276 output_csv_path=args.output_csv_path, 

277 output_jpg_path=args.output_jpg_path, 

278 properties=properties, 

279 ) 

280 

281 

282if __name__ == "__main__": 

283 main()