Coverage for biobb_dna/backbone/puckering.py: 81%

108 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-05-07 09:06 +0000

1#!/usr/bin/env python3 

2"""Module containing the Puckering class and the command line interface.""" 

3 

4import shutil 

5import argparse 

6 

7import matplotlib.pyplot as plt 

8import pandas as pd 

9import numpy as np 

10from biobb_dna.utils.loader import read_series 

11from biobb_dna.utils.transform import inverse_complement 

12from biobb_common.generic.biobb_object import BiobbObject 

13from biobb_common.tools.file_utils import launchlogger 

14from biobb_common.tools import file_utils as fu 

15from biobb_common.configuration import settings 

16 

17 

18class Puckering(BiobbObject): 

19 """ 

20 | biobb_dna Puckering 

21 | Calculate Puckering from phase parameters. 

22 

23 Args: 

24 input_phaseC_path (str): Path to .ser file for helical parameter 'phaseC'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_phaseC.ser>`_. Accepted formats: ser (edam:format_2330). 

25 input_phaseW_path (str): Path to .ser file for helical parameter 'phaseW'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_phaseW.ser>`_. Accepted formats: ser (edam:format_2330). 

26 output_csv_path (str): Path to .csv file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/puckering_ref.csv>`_. Accepted formats: csv (edam:format_3752). 

27 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/puckering_ref.jpg>`_. Accepted formats: jpg (edam:format_3579). 

28 properties (dict): 

29 * **sequence** (*str*) - (None) Nucleic acid sequence corresponding to the input .ser file. Length of sequence is expected to be the same as the total number of columns in the .ser file, minus the index column (even if later on a subset of columns is selected with the *seqpos* option). 

30 * **helpar_name** (*str*) - (None) helical parameter name. 

31 * **stride** (*int*) - (1000) granularity of the number of snapshots for plotting time series. 

32 * **seqpos** (*list*) - (None) list of sequence positions (columns indices starting by 0) to analyze. If not specified it will analyse the complete sequence. 

33 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files. 

34 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist. 

35 

36 Examples: 

37 This is a use example of how to use the building block from Python:: 

38 

39 from biobb_dna.backbone.puckering import puckering 

40 

41 prop = { 

42 'sequence': 'GCAT', 

43 } 

44 puckering( 

45 input_phaseC_path='/path/to/phaseC.ser', 

46 input_phaseW_path='/path/to/phaseW.ser', 

47 output_csv_path='/path/to/table/output.csv', 

48 output_jpg_path='/path/to/table/output.jpg', 

49 properties=prop) 

50 Info: 

51 * wrapped_software: 

52 * name: In house 

53 * license: Apache-2.0 

54 * ontology: 

55 * name: EDAM 

56 * schema: http://edamontology.org/EDAM.owl 

57 

58 """ 

59 

60 def __init__(self, input_phaseC_path, input_phaseW_path, 

61 output_csv_path, output_jpg_path, 

62 properties=None, **kwargs) -> None: 

63 properties = properties or {} 

64 

65 # Call parent class constructor 

66 super().__init__(properties) 

67 self.locals_var_dict = locals().copy() 

68 

69 # Input/Output files 

70 self.io_dict = { 

71 'in': { 

72 'input_phaseC_path': input_phaseC_path, 

73 'input_phaseW_path': input_phaseW_path 

74 }, 

75 'out': { 

76 'output_csv_path': output_csv_path, 

77 'output_jpg_path': output_jpg_path 

78 } 

79 } 

80 

81 self.properties = properties 

82 self.sequence = properties.get("sequence") 

83 self.stride = properties.get( 

84 "stride", 1000) 

85 self.seqpos = properties.get( 

86 "seqpos", None) 

87 

88 # Check the properties 

89 self.check_properties(properties) 

90 self.check_arguments() 

91 

92 @launchlogger 

93 def launch(self) -> int: 

94 """Execute the :class:`Puckering <backbone.puckering.Puckering>` object.""" 

95 

96 # Setup Biobb 

97 if self.check_restart(): 

98 return 0 

99 self.stage_files() 

100 

101 # check sequence 

102 if self.sequence is None or len(self.sequence) < 2: 

103 raise ValueError("sequence is null or too short!") 

104 

105 # check seqpos 

106 if self.seqpos is not None: 

107 if (max(self.seqpos) > len(self.sequence) - 2) or (min(self.seqpos) < 1): 

108 raise ValueError( 

109 f"seqpos values must be between 1 and {len(self.sequence) - 2}") 

110 if not (isinstance(self.seqpos, list) and len(self.seqpos) > 1): 

111 raise ValueError( 

112 "seqpos must be a list of at least two integers") 

113 

114 # Creating temporary folder 

115 self.tmp_folder = fu.create_unique_dir(prefix="backbone_") 

116 fu.log('Creating %s temporary folder' % self.tmp_folder, self.out_log) 

117 

118 # Copy input_file_path1 to temporary folder 

119 shutil.copy(self.io_dict['in']['input_phaseC_path'], self.tmp_folder) 

120 shutil.copy(self.io_dict['in']['input_phaseW_path'], self.tmp_folder) 

121 

122 # read input files 

123 phaseC = read_series( 

124 self.io_dict['in']['input_phaseC_path'], 

125 usecols=self.seqpos) 

126 phaseW = read_series( 

127 self.io_dict['in']['input_phaseW_path'], 

128 usecols=self.seqpos) 

129 

130 # fix angle range so its not negative 

131 phaseC = self.fix_angles(phaseC) 

132 phaseW = self.fix_angles(phaseW) 

133 

134 # calculate difference between epsil and zeta parameters 

135 xlabels = self.get_xlabels( 

136 self.sequence, 

137 inverse_complement(self.sequence)) 

138 Npop, Epop, Wpop, Spop = self.check_puckering(phaseC, phaseW) 

139 

140 # save plot 

141 fig, axs = plt.subplots(1, 1, dpi=300, tight_layout=True) 

142 axs.bar( 

143 range(len(xlabels)), 

144 Npop, 

145 label="North") 

146 axs.bar( 

147 range(len(xlabels)), 

148 Epop, 

149 bottom=Npop, 

150 label="East") 

151 axs.bar( 

152 range(len(xlabels)), 

153 Spop, 

154 bottom=Npop+Epop, 

155 label="South") 

156 axs.bar( 

157 range(len(xlabels)), 

158 Wpop, 

159 bottom=Npop+Epop+Spop, 

160 label="West") 

161 # empty bar to divide both sequences 

162 axs.bar( 

163 [len(phaseC.columns)], 

164 [100], 

165 color='white', 

166 label=None) 

167 axs.legend() 

168 axs.set_xticks(range(len(xlabels))) 

169 axs.set_xticklabels(xlabels, rotation=90) 

170 axs.set_xlabel("Nucleotide Sequence") 

171 axs.set_ylabel("Puckering (%)") 

172 axs.set_title("Nucleotide parameter: Puckering") 

173 fig.savefig( 

174 self.io_dict['out']['output_jpg_path'], 

175 format="jpg") 

176 

177 # save table 

178 populations = pd.DataFrame({ 

179 "Nucleotide": xlabels, 

180 "North": Npop, 

181 "East": Epop, 

182 "West": Wpop, 

183 "South": Spop}) 

184 populations.to_csv( 

185 self.io_dict['out']['output_csv_path'], 

186 index=False) 

187 

188 plt.close() 

189 

190 # Remove temporary file(s) 

191 self.tmp_files.extend([ 

192 self.stage_io_dict.get("unique_dir"), 

193 self.tmp_folder 

194 ]) 

195 self.remove_tmp_files() 

196 

197 self.check_arguments(output_files_created=True, raise_exception=False) 

198 

199 return 0 

200 

201 def get_xlabels(self, strand1, strand2): 

202 # get list of tetramers, except first and last two bases 

203 labelsW = list(strand1) 

204 labelsW[0] = f"{labelsW[0]}5\'" 

205 labelsW[-1] = f"{labelsW[-1]}3\'" 

206 labelsW = [ 

207 f"{i}-{j}" for i, j in zip(labelsW, range(1, len(labelsW)+1))] 

208 labelsC = list(strand2)[::-1] 

209 labelsC[0] = f"{labelsC[0]}5\'" 

210 labelsC[-1] = f"{labelsC[-1]}3\'" 

211 labelsC = [ 

212 f"{i}-{j}" for i, j in zip(labelsC, range(len(labelsC), 0, -1))] 

213 

214 if self.seqpos is not None: 

215 labelsC = [labelsC[i] for i in self.seqpos] 

216 labelsW = [labelsW[i] for i in self.seqpos] 

217 xlabels = labelsW + ['-'] + labelsC 

218 return xlabels 

219 

220 def check_puckering(self, phaseC, phaseW): 

221 separator_df = pd.DataFrame({"-": np.nan}, index=range(1, len(phaseC))) 

222 phase = pd.concat([ 

223 phaseW, 

224 separator_df, 

225 phaseC[phaseC.columns[::-1]]], 

226 axis=1) 

227 # phase.columns = columns 

228 

229 Npop = np.logical_or(phase > 315, phase < 45).mean() * 100 

230 Epop = np.logical_and(phase > 45, phase < 135).mean() * 100 

231 Wpop = np.logical_and(phase > 225, phase < 315).mean() * 100 

232 Spop = np.logical_and(phase > 135, phase < 225).mean() * 100 

233 return Npop, Epop, Wpop, Spop 

234 

235 def fix_angles(self, dataset): 

236 values = np.where(dataset < 0, dataset + 360, dataset) 

237 # values = np.where(values > 360, values - 360, values) 

238 dataset = pd.DataFrame(values) 

239 return dataset 

240 

241 

242def puckering( 

243 input_phaseC_path: str, input_phaseW_path: str, 

244 output_csv_path: str, output_jpg_path: str, 

245 properties: dict = None, **kwargs) -> int: 

246 """Create :class:`Puckering <dna.backbone.puckering.Puckering>` class and 

247 execute the: meth: `launch() <dna.backbone.puckering.Puckering.launch>` method. """ 

248 

249 return Puckering( 

250 input_phaseC_path=input_phaseC_path, 

251 input_phaseW_path=input_phaseW_path, 

252 output_csv_path=output_csv_path, 

253 output_jpg_path=output_jpg_path, 

254 properties=properties, **kwargs).launch() 

255 

256 

257def main(): 

258 """Command line execution of this building block. Please check the command line documentation.""" 

259 parser = argparse.ArgumentParser(description='Calculate North/East/West/South distribution of sugar puckering backbone torsions.', 

260 formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999)) 

261 parser.add_argument('--config', required=False, help='Configuration file') 

262 

263 required_args = parser.add_argument_group('required arguments') 

264 required_args.add_argument('--input_phaseC_path', required=True, 

265 help='Helical parameter <alphaC> input ser file path. Accepted formats: ser.') 

266 required_args.add_argument('--input_phaseW_path', required=True, 

267 help='Helical parameter <alphaW> input ser file path. Accepted formats: ser.') 

268 required_args.add_argument('--output_csv_path', required=True, 

269 help='Path to output csv file. Accepted formats: csv.') 

270 required_args.add_argument('--output_jpg_path', required=True, 

271 help='Path to output jpg file. Accepted formats: jpg.') 

272 

273 args = parser.parse_args() 

274 args.config = args.config or "{}" 

275 properties = settings.ConfReader(config=args.config).get_prop_dic() 

276 

277 puckering( 

278 input_phaseC_path=args.input_phaseC_path, 

279 input_phaseW_path=args.input_phaseW_path, 

280 output_csv_path=args.output_csv_path, 

281 output_jpg_path=args.output_jpg_path, 

282 properties=properties) 

283 

284 

285if __name__ == '__main__': 

286 main()