Coverage for biobb_godmd/godmd/godmd_prep.py: 87%

163 statements  

« prev     ^ index     » next       coverage.py v7.9.1, created at 2025-06-25 07:50 +0000

1#!/usr/bin/env python3 

2 

3"""Module containing the GOdMDPrep class and the command line interface.""" 

4import argparse 

5from typing import Optional 

6from pathlib import Path 

7from biobb_common.generic.biobb_object import BiobbObject 

8from biobb_common.configuration import settings 

9from biobb_common.tools import file_utils as fu 

10from biobb_common.tools.file_utils import launchlogger 

11from biobb_godmd.godmd.common import check_input_path, check_output_path 

12 

13 

14class GOdMDPrep(BiobbObject): 

15 """ 

16 | biobb_godmd GOdMDPrep 

17 | Helper bb to prepare inputs for the `GOdMD tool <http://mmb.irbbarcelona.org/GOdMD/>`_ module. 

18 | Prepares input files for the GOdMD tool. 

19 

20 Args: 

21 input_pdb_orig_path (str): Input PDB file to be used as origin in the conformational transition. File type: input. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/master/biobb_godmd/test/data/godmd/1ake_A.pdb>`_. Accepted formats: pdb (edam:format_1476). 

22 input_pdb_target_path (str): Input PDB file to be used as target in the conformational transition. File type: input. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/master/biobb_godmd/test/data/godmd/4ake_A.pdb>`_. Accepted formats: pdb (edam:format_1476). 

23 output_aln_orig_path (str): Output GOdMD alignment file corresponding to the origin structure of the conformational transition. File type: output. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/master/biobb_godmd/test/data/godmd/1ake_A.aln>`_. Accepted formats: aln (edam:format_2330), txt (edam:format_2330). 

24 output_aln_target_path (str): Output GOdMD alignment file corresponding to the target structure of the conformational transition. File type: output. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/master/biobb_godmd/test/data/godmd/4ake_A.aln>`_. Accepted formats: aln (edam:format_2330), txt (edam:format_2330). 

25 properties (dict - Python dictionary object containing the tool parameters, not input/output files): 

26 * **gapopen** (*float*) - (12.0) Standard gap penalty: score taken away when a gap is created. 

27 * **gapextend** (*float*) - (2.0) Penalty added to the standard gap penalty for each base or residue in the gap. 

28 * **datafile** (*str*) - ("EPAM250") Scoring matrix file used when comparing sequences. 

29 * **binary_path** (*str*) - ("water") Binary path. 

30 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files. 

31 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist. 

32 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory. 

33 

34 Examples: 

35 This is a use example of how to use the building block from Python:: 

36 

37 from biobb_godmd.godmd.godmd_prep import godmd_prep 

38 prop = { 

39 'gapopen': 10.0, 

40 'gapextend': 2.0 

41 } 

42 godmd_prep( input_pdb_orig_path='/path/to/input_orig.pdb', 

43 input_pdb_target_path='/path/to/input_target.pdb', 

44 output_aln_orig_path='/path/to/orig.aln', 

45 output_aln_target_path='/path/to/target.aln', 

46 properties=prop) 

47 

48 Info: 

49 * wrapped_software: 

50 * name: In house 

51 * license: Apache-2.0 

52 * ontology: 

53 * name: EDAM 

54 * schema: http://edamontology.org/EDAM.owl 

55 

56 """ 

57 

58 def __init__(self, input_pdb_orig_path: str, input_pdb_target_path: str, 

59 output_aln_orig_path: str, output_aln_target_path: str, 

60 properties: Optional[dict] = None, **kwargs) -> None: 

61 

62 properties = properties or {} 

63 

64 # Call parent class constructor 

65 super().__init__(properties) 

66 self.locals_var_dict = locals().copy() 

67 

68 self.AA_TRANSLATOR = {'GLY': 'G', 

69 'ALA': 'A', 

70 'VAL': 'V', 

71 'LEU': 'L', 

72 'ILE': 'I', 

73 'MET': 'M', 

74 'PHE': 'F', 

75 'TRP': 'W', 

76 'PRO': 'P', 

77 'SER': 'S', 

78 'THR': 'T', 

79 'CYS': 'C', 

80 'CYX': 'C', 

81 'CYM': 'C', 

82 'TYR': 'Y', 

83 'TYM': 'Y', 

84 'ASN': 'N', 

85 'GLN': 'Q', 

86 'ASP': 'D', 

87 'ASH': 'D', 

88 'GLU': 'E', 

89 'GLH': 'E', 

90 'LYS': 'K', 

91 'LYN': 'K', 

92 'ARG': 'R', 

93 'ARN': 'R', 

94 'HIS': 'H', 

95 'HIE': 'H', 

96 'HID': 'H', 

97 'HIP': 'H'} 

98 

99 # Input/Output files 

100 self.io_dict = { 

101 'in': {'input_pdb_orig_path': input_pdb_orig_path, 

102 'input_pdb_target_path': input_pdb_target_path}, 

103 'out': {'output_aln_orig_path': output_aln_orig_path, 

104 'output_aln_target_path': output_aln_target_path} 

105 } 

106 

107 # Properties specific for BB 

108 self.properties = properties 

109 self.gapopen = properties.get('gapopen', 12.0) 

110 self.gapextend = properties.get('gapextend', 2.0) 

111 self.datafile = properties.get('datafile', "EPAM250") 

112 self.binary_path = properties.get('binary_path', "water") 

113 

114 # Check the properties 

115 self.check_properties(properties) 

116 self.check_arguments() 

117 

118 def check_data_params(self, out_log, out_err): 

119 """ Checks input/output paths correctness """ 

120 

121 # Check input(s) 

122 self.io_dict["in"]["input_pdb_orig_path"] = check_input_path(self.io_dict["in"]["input_pdb_orig_path"], "input_pdb_orig_path", False, out_log, self.__class__.__name__) 

123 self.io_dict["in"]["input_pdb_target_path"] = check_input_path(self.io_dict["in"]["input_pdb_target_path"], "input_pdb_target_path", False, out_log, self.__class__.__name__) 

124 

125 # Check output(s) 

126 self.io_dict["out"]["output_aln_orig_path"] = check_output_path(self.io_dict["out"]["output_aln_orig_path"], "output_aln_orig_path", False, out_log, self.__class__.__name__) 

127 self.io_dict["out"]["output_aln_target_path"] = check_output_path(self.io_dict["out"]["output_aln_target_path"], "output_aln_target_path", False, out_log, self.__class__.__name__) 

128 

129 def extract_sequence(self, pdb): 

130 ''' 

131 Parses a PDB file to retrieve the sequence of a certain chain. 

132 ''' 

133 seq = '' 

134 resids = [] 

135 with open(pdb) as file: 

136 for line in file: 

137 line = line.rstrip('\n') 

138 record = line[:6].replace(" ", "") 

139 if record != 'ATOM': 

140 continue 

141 atomname = line[12:16] 

142 atomname = atomname.replace(" ", "") 

143 if atomname != 'CA': 

144 continue 

145 alternate = line[16] 

146 if alternate != ' ': 

147 if alternate != 'A': 

148 continue # if alt loc, only A-labeled residues 

149 resname = line[17:20].replace(" ", "") 

150 if resname not in self.AA_TRANSLATOR: 

151 continue 

152 resnum = line[22:26].replace(" ", "") 

153 icode = line[26:27] 

154 seq += self.AA_TRANSLATOR[resname] 

155 resids += [(' ', int(resnum), icode)] # BioPython's standard PDB atom id (' ',resid,insertioncode) (e.g. (' ',40,'B')) 

156 

157 return seq, resids 

158 

159 def retrieve_alignment(self, waterFile, resids1, resids2): 

160 ''' 

161 Gets the FASTA sequence of two PDB structures and a list of their residues in Biopython residue format. 

162 Opens a file containing a local sequence alignment (Water program of EMBOSS package) between those two PDB structures. 

163 Returns the sequence identity and the pairs of residues of the alignment. 

164 ''' 

165 # Parsing Water alignment file 

166 is_alignment = 0 

167 is_qseq = 0 

168 is_sseq = 0 

169 qseq = "" 

170 sseq = "" 

171 ident = "" 

172 next_seq = 0 

173 qstart = "" 

174 sstart = "" 

175 with open(waterFile) as file: 

176 for line in file: 

177 line = line.rstrip("\n") 

178 if is_alignment == 0: 

179 if not line.startswith(">>#1"): 

180 continue 

181 else: 

182 is_alignment = 1 

183 else: 

184 if line.startswith("; sw_ident:"): 

185 ident = line[12:] 

186 elif line.startswith("; al_start:"): 

187 if next_seq == 0: 

188 qstart = line[12:] 

189 else: 

190 sstart = line[12:] 

191 elif line.startswith("; al_display_start:"): 

192 if next_seq == 0: 

193 is_qseq = 1 

194 else: 

195 is_sseq = 1 

196 else: 

197 if is_qseq == 1: 

198 if not line.startswith(">"): 

199 qseq = qseq + line 

200 else: 

201 is_qseq = 0 

202 next_seq = 1 

203 if is_sseq == 1: 

204 if line != "": 

205 sseq = sseq + line 

206 else: 

207 is_sseq = 0 

208 

209 # Get the Sequence Identity of the alignment 

210 seq_id = float(ident) 

211 

212 # Get the residues of the alignment. 

213 # -2 is applied because in the first iteration of next loop all values will always be increased by 1 (local alignment) 

214 # and we also need to convert the number into a python index to retrieve the proper token in the list 

215 resid_pairs = [] 

216 idx1 = int(qstart)-2 

217 idx2 = int(sstart)-2 

218 for i in range(len(qseq)): # it could be also len(sseq) 

219 if qseq[i] != '-': 

220 idx1 += 1 

221 if sseq[i] != '-': 

222 idx2 += 1 

223 if qseq[i] == '-' or sseq[i] == '-': 

224 continue 

225 resid_pairs += [(resids1[idx1], resids2[idx2])] 

226 # Check contents of the residues of the alignment 

227 if len(resid_pairs) == 0: # Alignment file was empty or there was no possible matching residues between the PDBs 

228 fu.log('Alignment file was empty or there was no possible matching residues between the PDBs' % self.stage_io_dict["unique_dir"], self.out_log) 

229 return False, False 

230 else: 

231 return seq_id, resid_pairs 

232 

233 @launchlogger 

234 def launch(self): 

235 """Launches the execution of the GOdMDPrep module.""" 

236 

237 # check input/output paths and parameters 

238 self.check_data_params(self.out_log, self.err_log) 

239 

240 # Setup Biobb 

241 if self.check_restart(): 

242 return 0 

243 self.stage_files() 

244 

245 # Starting the work... 

246 # Parsing the input PDB files 

247 pdb1 = self.io_dict["in"]["input_pdb_orig_path"] 

248 pdb2 = self.io_dict["in"]["input_pdb_target_path"] 

249 

250 # Generate sequence of first PDB 

251 seq1, resids1 = self.extract_sequence(pdb1) 

252 

253 # Generate sequence of second PDB 

254 seq2, resids2 = self.extract_sequence(pdb2) 

255 

256 if len(seq1) < 50: 

257 fu.log('WARNING: Short sequence (ORIGIN)' % self.stage_io_dict["unique_dir"], self.out_log) 

258 if len(seq2) < 50: 

259 fu.log('WARNING: Short sequence (TARGET)' % self.stage_io_dict["unique_dir"], self.out_log) 

260 

261 # Produce FASTA files 

262 name1 = "fasta1.fa" 

263 name2 = "fasta2.fa" 

264 fasta1Filename = str(Path(self.stage_io_dict["unique_dir"]).joinpath(name1)) 

265 fasta1File = open(fasta1Filename, "w") 

266 fasta1File.write(">"+pdb1+"\n"+seq1) 

267 fasta1File.close() 

268 fasta2Filename = str(Path(self.stage_io_dict["unique_dir"]).joinpath(name2)) 

269 fasta2File = open(fasta2Filename, "w") 

270 fasta2File.write(">"+pdb2+"\n"+seq2) 

271 fasta2File.close() 

272 

273 waterFilename = str(Path(self.stage_io_dict["unique_dir"]).joinpath("water_align.out")) 

274 

275 # water -auto -outfile=water_align.out -asequence=1ake.chains.nolig.pdb.fa 

276 # -bsequence=4ake.chains.pdb.fa -gapopen=12 -gapextend=2 

277 # -datafile=EPAM250 -aformat=markx10 

278 

279 # Command line 

280 self.cmd = [self.binary_path, 

281 '-auto', 

282 '-outfile', waterFilename, 

283 '-asequence', fasta1Filename, 

284 '-bsequence', fasta2Filename, 

285 '-gapopen', str(self.gapopen), 

286 '-gapextend', str(self.gapextend), 

287 '-datafile', self.datafile, 

288 '-aformat', "markx10" 

289 ] 

290 

291 # Run Biobb block 

292 self.run_biobb() 

293 

294 # Starting post-alignment process: generating .aln files 

295 

296 # Retrieve sequence identity of the pair, based on local sequence alignment 

297 # previously computed, and generate the pairs of residues from both structres 

298 # to drive the superimposition 

299 seq_id, resid_pairs = self.retrieve_alignment(waterFilename, resids1, resids2) 

300 

301 aln1Filename = self.io_dict["out"]["output_aln_orig_path"] 

302 aln1File = open(aln1Filename, 'w') 

303 for pair in resid_pairs: 

304 aln1File.write("%s%s\n" % (pair[0][1], pair[0][2].replace(" ", ""))) 

305 aln1File.close() 

306 

307 aln2Filename = self.io_dict["out"]["output_aln_target_path"] 

308 aln2File = open(aln2Filename, 'w') 

309 for pair in resid_pairs: 

310 aln2File.write("%s%s\n" % (pair[1][1], pair[1][2].replace(" ", ""))) 

311 aln2File.close() 

312 

313 # Copy files to host 

314 self.copy_to_host() 

315 

316 # remove temporary folder(s) 

317 # self.tmp_files.extend([ 

318 # self.stage_io_dict.get("unique_dir", "") 

319 # ]) 

320 self.remove_tmp_files() 

321 

322 self.check_arguments(output_files_created=True, raise_exception=False) 

323 

324 return self.return_code 

325 

326 

327def godmd_prep(input_pdb_orig_path: str, input_pdb_target_path: str, 

328 output_aln_orig_path: str, output_aln_target_path: str, 

329 properties: Optional[dict] = None, **kwargs) -> int: 

330 """Create :class:`GOdMDPrep <godmd.godmd_prep.GOdMDPrep>`godmd.godmd_prep.GOdMDPrep class and 

331 execute :meth:`launch() <godmd.godmd_prep.GOdMDPrep.launch>` method""" 

332 

333 return GOdMDPrep(input_pdb_orig_path=input_pdb_orig_path, 

334 input_pdb_target_path=input_pdb_target_path, 

335 output_aln_orig_path=output_aln_orig_path, 

336 output_aln_target_path=output_aln_target_path, 

337 properties=properties).launch() 

338 

339 godmd_prep.__doc__ = GOdMDPrep.__doc__ 

340 

341 

342def main(): 

343 parser = argparse.ArgumentParser(description='Prepares input files for the GOdMD tool.', formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999)) 

344 parser.add_argument('--config', required=False, help='Configuration file') 

345 

346 # Specific args 

347 required_args = parser.add_argument_group('required arguments') 

348 required_args.add_argument('--input_pdb_orig_path', required=True, help='Input PDB file to be used as origin in the conformational transition. Accepted formats: pdb.') 

349 required_args.add_argument('--input_pdb_target_path', required=True, help='Input PDB file to be used as target in the conformational transition. Accepted formats: pdb.') 

350 required_args.add_argument('--output_aln_orig_path', required=True, help='Output GOdMD alignment file corresponding to the origin structure of the conformational transition. Accepted formats: aln, txt.') 

351 required_args.add_argument('--output_aln_target_path', required=True, help='Output GOdMD alignment file corresponding to the target structure of the conformational transition. Accepted formats: aln, txt.') 

352 

353 args = parser.parse_args() 

354 # config = args.config if args.config else None 

355 args.config = args.config or "{}" 

356 # properties = settings.ConfReader(config=config).get_prop_dic() 

357 properties = settings.ConfReader(config=args.config).get_prop_dic() 

358 

359 # Specific call 

360 godmd_prep(input_pdb_orig_path=args.input_pdb_orig_path, 

361 input_pdb_target_path=args.input_pdb_target_path, 

362 output_aln_orig_path=args.output_aln_orig_path, 

363 output_aln_target_path=args.output_aln_target_path, 

364 properties=properties) 

365 

366 

367if __name__ == '__main__': 

368 main()