Coverage for biobb_godmd / godmd / godmd_prep.py: 93%

150 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-22 11:45 +0000

1#!/usr/bin/env python3 

2 

3"""Module containing the GOdMDPrep class and the command line interface.""" 

4from typing import Optional 

5from pathlib import Path 

6from biobb_common.generic.biobb_object import BiobbObject 

7from biobb_common.tools import file_utils as fu 

8from biobb_common.tools.file_utils import launchlogger 

9from biobb_godmd.godmd.common import check_input_path, check_output_path 

10 

11 

12class GOdMDPrep(BiobbObject): 

13 """ 

14 | biobb_godmd GOdMDPrep 

15 | Helper bb to prepare inputs for the `GOdMD tool <http://mmb.irbbarcelona.org/GOdMD/>`_ module. 

16 | Prepares input files for the GOdMD tool. 

17 

18 Args: 

19 input_pdb_orig_path (str): Input PDB file to be used as origin in the conformational transition. File type: input. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/main/biobb_godmd/test/data/godmd/1ake_A.pdb>`_. Accepted formats: pdb (edam:format_1476). 

20 input_pdb_target_path (str): Input PDB file to be used as target in the conformational transition. File type: input. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/main/biobb_godmd/test/data/godmd/4ake_A.pdb>`_. Accepted formats: pdb (edam:format_1476). 

21 output_aln_orig_path (str): Output GOdMD alignment file corresponding to the origin structure of the conformational transition. File type: output. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/main/biobb_godmd/test/data/godmd/1ake_A.aln>`_. Accepted formats: aln (edam:format_2330), txt (edam:format_2330). 

22 output_aln_target_path (str): Output GOdMD alignment file corresponding to the target structure of the conformational transition. File type: output. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/main/biobb_godmd/test/data/godmd/4ake_A.aln>`_. Accepted formats: aln (edam:format_2330), txt (edam:format_2330). 

23 properties (dict - Python dictionary object containing the tool parameters, not input/output files): 

24 * **gapopen** (*float*) - (12.0) Standard gap penalty: score taken away when a gap is created. 

25 * **gapextend** (*float*) - (2.0) Penalty added to the standard gap penalty for each base or residue in the gap. 

26 * **datafile** (*str*) - ("EPAM250") Scoring matrix file used when comparing sequences. 

27 * **binary_path** (*str*) - ("water") Binary path. 

28 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files. 

29 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist. 

30 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory. 

31 

32 Examples: 

33 This is a use example of how to use the building block from Python:: 

34 

35 from biobb_godmd.godmd.godmd_prep import godmd_prep 

36 prop = { 

37 'gapopen': 10.0, 

38 'gapextend': 2.0 

39 } 

40 godmd_prep( input_pdb_orig_path='/path/to/input_orig.pdb', 

41 input_pdb_target_path='/path/to/input_target.pdb', 

42 output_aln_orig_path='/path/to/orig.aln', 

43 output_aln_target_path='/path/to/target.aln', 

44 properties=prop) 

45 

46 Info: 

47 * wrapped_software: 

48 * name: In house 

49 * license: Apache-2.0 

50 * ontology: 

51 * name: EDAM 

52 * schema: http://edamontology.org/EDAM.owl 

53 

54 """ 

55 

56 def __init__(self, input_pdb_orig_path: str, input_pdb_target_path: str, 

57 output_aln_orig_path: str, output_aln_target_path: str, 

58 properties: Optional[dict] = None, **kwargs) -> None: 

59 

60 properties = properties or {} 

61 

62 # Call parent class constructor 

63 super().__init__(properties) 

64 self.locals_var_dict = locals().copy() 

65 

66 self.AA_TRANSLATOR = {'GLY': 'G', 

67 'ALA': 'A', 

68 'VAL': 'V', 

69 'LEU': 'L', 

70 'ILE': 'I', 

71 'MET': 'M', 

72 'PHE': 'F', 

73 'TRP': 'W', 

74 'PRO': 'P', 

75 'SER': 'S', 

76 'THR': 'T', 

77 'CYS': 'C', 

78 'CYX': 'C', 

79 'CYM': 'C', 

80 'TYR': 'Y', 

81 'TYM': 'Y', 

82 'ASN': 'N', 

83 'GLN': 'Q', 

84 'ASP': 'D', 

85 'ASH': 'D', 

86 'GLU': 'E', 

87 'GLH': 'E', 

88 'LYS': 'K', 

89 'LYN': 'K', 

90 'ARG': 'R', 

91 'ARN': 'R', 

92 'HIS': 'H', 

93 'HIE': 'H', 

94 'HID': 'H', 

95 'HIP': 'H'} 

96 

97 # Input/Output files 

98 self.io_dict = { 

99 'in': {'input_pdb_orig_path': input_pdb_orig_path, 

100 'input_pdb_target_path': input_pdb_target_path}, 

101 'out': {'output_aln_orig_path': output_aln_orig_path, 

102 'output_aln_target_path': output_aln_target_path} 

103 } 

104 

105 # Properties specific for BB 

106 self.properties = properties 

107 self.gapopen = properties.get('gapopen', 12.0) 

108 self.gapextend = properties.get('gapextend', 2.0) 

109 self.datafile = properties.get('datafile', "EPAM250") 

110 self.binary_path = properties.get('binary_path', "water") 

111 

112 # Check the properties 

113 self.check_properties(properties) 

114 self.check_arguments() 

115 

116 def check_data_params(self, out_log, out_err): 

117 """ Checks input/output paths correctness """ 

118 

119 # Check input(s) 

120 self.io_dict["in"]["input_pdb_orig_path"] = check_input_path(self.io_dict["in"]["input_pdb_orig_path"], "input_pdb_orig_path", False, out_log, self.__class__.__name__) 

121 self.io_dict["in"]["input_pdb_target_path"] = check_input_path(self.io_dict["in"]["input_pdb_target_path"], "input_pdb_target_path", False, out_log, self.__class__.__name__) 

122 

123 # Check output(s) 

124 self.io_dict["out"]["output_aln_orig_path"] = check_output_path(self.io_dict["out"]["output_aln_orig_path"], "output_aln_orig_path", False, out_log, self.__class__.__name__) 

125 self.io_dict["out"]["output_aln_target_path"] = check_output_path(self.io_dict["out"]["output_aln_target_path"], "output_aln_target_path", False, out_log, self.__class__.__name__) 

126 

127 def extract_sequence(self, pdb): 

128 ''' 

129 Parses a PDB file to retrieve the sequence of a certain chain. 

130 ''' 

131 seq = '' 

132 resids = [] 

133 with open(pdb) as file: 

134 for line in file: 

135 line = line.rstrip('\n') 

136 record = line[:6].replace(" ", "") 

137 if record != 'ATOM': 

138 continue 

139 atomname = line[12:16] 

140 atomname = atomname.replace(" ", "") 

141 if atomname != 'CA': 

142 continue 

143 alternate = line[16] 

144 if alternate != ' ': 

145 if alternate != 'A': 

146 continue # if alt loc, only A-labeled residues 

147 resname = line[17:20].replace(" ", "") 

148 if resname not in self.AA_TRANSLATOR: 

149 continue 

150 resnum = line[22:26].replace(" ", "") 

151 icode = line[26:27] 

152 seq += self.AA_TRANSLATOR[resname] 

153 resids += [(' ', int(resnum), icode)] # BioPython's standard PDB atom id (' ',resid,insertioncode) (e.g. (' ',40,'B')) 

154 

155 return seq, resids 

156 

157 def retrieve_alignment(self, waterFile, resids1, resids2): 

158 ''' 

159 Gets the FASTA sequence of two PDB structures and a list of their residues in Biopython residue format. 

160 Opens a file containing a local sequence alignment (Water program of EMBOSS package) between those two PDB structures. 

161 Returns the sequence identity and the pairs of residues of the alignment. 

162 ''' 

163 # Parsing Water alignment file 

164 is_alignment = 0 

165 is_qseq = 0 

166 is_sseq = 0 

167 qseq = "" 

168 sseq = "" 

169 ident = "" 

170 next_seq = 0 

171 qstart = "" 

172 sstart = "" 

173 with open(waterFile) as file: 

174 for line in file: 

175 line = line.rstrip("\n") 

176 if is_alignment == 0: 

177 if not line.startswith(">>#1"): 

178 continue 

179 else: 

180 is_alignment = 1 

181 else: 

182 if line.startswith("; sw_ident:"): 

183 ident = line[12:] 

184 elif line.startswith("; al_start:"): 

185 if next_seq == 0: 

186 qstart = line[12:] 

187 else: 

188 sstart = line[12:] 

189 elif line.startswith("; al_display_start:"): 

190 if next_seq == 0: 

191 is_qseq = 1 

192 else: 

193 is_sseq = 1 

194 else: 

195 if is_qseq == 1: 

196 if not line.startswith(">"): 

197 qseq = qseq + line 

198 else: 

199 is_qseq = 0 

200 next_seq = 1 

201 if is_sseq == 1: 

202 if line != "": 

203 sseq = sseq + line 

204 else: 

205 is_sseq = 0 

206 

207 # Get the Sequence Identity of the alignment 

208 seq_id = float(ident) 

209 

210 # Get the residues of the alignment. 

211 # -2 is applied because in the first iteration of next loop all values will always be increased by 1 (local alignment) 

212 # and we also need to convert the number into a python index to retrieve the proper token in the list 

213 resid_pairs = [] 

214 idx1 = int(qstart)-2 

215 idx2 = int(sstart)-2 

216 for i in range(len(qseq)): # it could be also len(sseq) 

217 if qseq[i] != '-': 

218 idx1 += 1 

219 if sseq[i] != '-': 

220 idx2 += 1 

221 if qseq[i] == '-' or sseq[i] == '-': 

222 continue 

223 resid_pairs += [(resids1[idx1], resids2[idx2])] 

224 # Check contents of the residues of the alignment 

225 if len(resid_pairs) == 0: # Alignment file was empty or there was no possible matching residues between the PDBs 

226 fu.log('Alignment file was empty or there was no possible matching residues between the PDBs' % self.stage_io_dict["unique_dir"], self.out_log) 

227 return False, False 

228 else: 

229 return seq_id, resid_pairs 

230 

231 @launchlogger 

232 def launch(self): 

233 """Launches the execution of the GOdMDPrep module.""" 

234 

235 # check input/output paths and parameters 

236 self.check_data_params(self.out_log, self.err_log) 

237 

238 # Setup Biobb 

239 if self.check_restart(): 

240 return 0 

241 self.stage_files() 

242 

243 # Starting the work... 

244 # Parsing the input PDB files 

245 pdb1 = self.io_dict["in"]["input_pdb_orig_path"] 

246 pdb2 = self.io_dict["in"]["input_pdb_target_path"] 

247 

248 # Generate sequence of first PDB 

249 seq1, resids1 = self.extract_sequence(pdb1) 

250 

251 # Generate sequence of second PDB 

252 seq2, resids2 = self.extract_sequence(pdb2) 

253 

254 if len(seq1) < 50: 

255 fu.log('WARNING: Short sequence (ORIGIN)' % self.stage_io_dict["unique_dir"], self.out_log) 

256 if len(seq2) < 50: 

257 fu.log('WARNING: Short sequence (TARGET)' % self.stage_io_dict["unique_dir"], self.out_log) 

258 

259 # Produce FASTA files 

260 name1 = "fasta1.fa" 

261 name2 = "fasta2.fa" 

262 fasta1Filename = str(Path(self.stage_io_dict["unique_dir"]).joinpath(name1)) 

263 fasta1File = open(fasta1Filename, "w") 

264 fasta1File.write(">"+pdb1+"\n"+seq1) 

265 fasta1File.close() 

266 fasta2Filename = str(Path(self.stage_io_dict["unique_dir"]).joinpath(name2)) 

267 fasta2File = open(fasta2Filename, "w") 

268 fasta2File.write(">"+pdb2+"\n"+seq2) 

269 fasta2File.close() 

270 

271 waterFilename = str(Path(self.stage_io_dict["unique_dir"]).joinpath("water_align.out")) 

272 

273 # water -auto -outfile=water_align.out -asequence=1ake.chains.nolig.pdb.fa 

274 # -bsequence=4ake.chains.pdb.fa -gapopen=12 -gapextend=2 

275 # -datafile=EPAM250 -aformat=markx10 

276 

277 # Command line 

278 self.cmd = [self.binary_path, 

279 '-auto', 

280 '-outfile', waterFilename, 

281 '-asequence', fasta1Filename, 

282 '-bsequence', fasta2Filename, 

283 '-gapopen', str(self.gapopen), 

284 '-gapextend', str(self.gapextend), 

285 '-datafile', self.datafile, 

286 '-aformat', "markx10" 

287 ] 

288 

289 # Run Biobb block 

290 self.run_biobb() 

291 

292 # Starting post-alignment process: generating .aln files 

293 

294 # Retrieve sequence identity of the pair, based on local sequence alignment 

295 # previously computed, and generate the pairs of residues from both structres 

296 # to drive the superimposition 

297 seq_id, resid_pairs = self.retrieve_alignment(waterFilename, resids1, resids2) 

298 

299 aln1Filename = self.io_dict["out"]["output_aln_orig_path"] 

300 aln1File = open(aln1Filename, 'w') 

301 for pair in resid_pairs: 

302 aln1File.write("%s%s\n" % (pair[0][1], pair[0][2].replace(" ", ""))) 

303 aln1File.close() 

304 

305 aln2Filename = self.io_dict["out"]["output_aln_target_path"] 

306 aln2File = open(aln2Filename, 'w') 

307 for pair in resid_pairs: 

308 aln2File.write("%s%s\n" % (pair[1][1], pair[1][2].replace(" ", ""))) 

309 aln2File.close() 

310 

311 # Copy files to host 

312 self.copy_to_host() 

313 

314 # Remove temporary folder(s) 

315 self.remove_tmp_files() 

316 

317 self.check_arguments(output_files_created=True, raise_exception=False) 

318 

319 return self.return_code 

320 

321 

322def godmd_prep(input_pdb_orig_path: str, input_pdb_target_path: str, 

323 output_aln_orig_path: str, output_aln_target_path: str, 

324 properties: Optional[dict] = None, **kwargs) -> int: 

325 """Create :class:`GOdMDPrep <godmd.godmd_prep.GOdMDPrep>`godmd.godmd_prep.GOdMDPrep class and 

326 execute :meth:`launch() <godmd.godmd_prep.GOdMDPrep.launch>` method""" 

327 return GOdMDPrep(**dict(locals())).launch() 

328 

329 

330godmd_prep.__doc__ = GOdMDPrep.__doc__ 

331main = GOdMDPrep.get_main(godmd_prep, "Prepares input files for the GOdMD tool.") 

332 

333if __name__ == '__main__': 

334 main()