Coverage for biobb_vs / utils / bindingsite.py: 85%

164 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-22 13:24 +0000

1#!/usr/bin/env python3 

2 

3"""Module containing the BindingSite class and the command line interface.""" 

4import re 

5import warnings 

6from pathlib import PurePath 

7from typing import Optional 

8from Bio import BiopythonDeprecationWarning 

9from biobb_common.generic.biobb_object import BiobbObject 

10from biobb_common.tools import file_utils as fu 

11from biobb_common.tools.file_utils import launchlogger 

12 

13from biobb_vs.utils.common import ( 

14 align_sequences, 

15 calculate_alignment_identity, 

16 check_input_path, 

17 check_output_path, 

18 get_ligand_residues, 

19 get_pdb_sequence, 

20 get_residue_by_id, 

21) 

22 

23with warnings.catch_warnings(): 

24 warnings.simplefilter("ignore", BiopythonDeprecationWarning) 

25 # try: 

26 # import Bio.SubsMat.MatrixInfo 

27 # except ImportError: 

28 import Bio.Align.substitution_matrices 

29 import Bio.pairwise2 

30 import Bio.PDB 

31 

32 

33class BindingSite(BiobbObject): 

34 """ 

35 | biobb_vs BindingSite 

36 | This class finds the binding site of the input_pdb. 

37 | Finds the binding site of the input_pdb_path file based on the ligands' location of similar structures (members of the sequence identity cluster) 

38 

39 Args: 

40 input_pdb_path (str): Path to the PDB structure where the binding site is to be found. File type: input. `Sample file <https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/data/utils/bindingsite.pdb>`_. Accepted formats: pdb (edam:format_1476). 

41 input_clusters_zip (str): Path to the ZIP file with all the PDB members of the identity cluster. File type: input. `Sample file <https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/data/utils/bindingsite.zip>`_. Accepted formats: zip (edam:format_3987). 

42 output_pdb_path (str): Path to the PDB containig the residues belonging to the binding site. File type: output. `Sample file <https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/reference/utils/ref_output_bindingsite.pdb>`_. Accepted formats: pdb (edam:format_1476). 

43 properties (dic - Python dictionary object containing the tool parameters, not input/output files): 

44 * **ligand** (*str*) - (None) Ligand to be found in the protein structure. If no ligand provided, the largest one will be selected, if more than one. 

45 * **radius** (*float*) - (5.0) [0.1~1000|0.1] Cut-off distance (Ångstroms) around ligand atoms to consider a protein atom as a binding site atom. 

46 * **max_num_ligands** (*int*) - (15) [0~1000|1] Total number of superimposed ligands to be extracted from the identity cluster. For populated clusters, the restriction avoids to superimpose redundant structures. If 0, all ligands extracted will be considered. 

47 * **matrix_name** (*str*) - ("BLOSUM62") Substitution matrices for use in alignments. Values: BENNER22, BENNER6, BENNER74, BLASTN, BLASTP, BLOSUM45, BLOSUM50, BLOSUM62, BLOSUM80, BLOSUM90, DAYHOFF, FENG, GENETIC, GONNET1992, HOXD70, JOHNSON, JONES, LEVIN, MCLACHLAN, MDM78, MEGABLAST, NUC.4.4, PAM250, PAM30, PAM70, RAO, RISLER, SCHNEIDER, STR, TRANS. 

48 * **gap_open** (*float*) - (-10.0) [-1000~1000|0.1] Gap open penalty. 

49 * **gap_extend** (*float*) - (-0.5) [-1000~1000|0.1] Gap extend penalty. 

50 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files. 

51 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist. 

52 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory. 

53 

54 Examples: 

55 This is a use example of how to use the building block from Python:: 

56 

57 from biobb_vs.utils.bindingsite import bindingsite 

58 prop = { 

59 'ligand': 'PGA', 

60 'matrix_name': 'BLOSUM62', 

61 'gap_open': -10.0, 

62 'gap_extend': -0.5, 

63 'max_num_ligands': 15, 

64 'radius': 5 

65 } 

66 bindingsite(input_pdb_path='/path/to/myStructure.pdb', 

67 input_clusters_zip='/path/to/myCluster.zip', 

68 output_pdb_path='/path/to/newStructure.pdb', 

69 properties=prop) 

70 

71 Info: 

72 * wrapped_software: 

73 * name: In house using Biopython 

74 * version: >=1.76 

75 * license: Apache-2.0 

76 * ontology: 

77 * name: EDAM 

78 * schema: http://edamontology.org/EDAM.owl 

79 

80 """ 

81 

82 def __init__( 

83 self, 

84 input_pdb_path, 

85 input_clusters_zip, 

86 output_pdb_path, 

87 properties=None, 

88 **kwargs, 

89 ) -> None: 

90 properties = properties or {} 

91 

92 # Call parent class constructor 

93 super().__init__(properties) 

94 self.locals_var_dict = locals().copy() 

95 

96 # Input/Output files 

97 self.io_dict = { 

98 "in": { 

99 "input_pdb_path": input_pdb_path, 

100 "input_clusters_zip": input_clusters_zip, 

101 }, 

102 "out": {"output_pdb_path": output_pdb_path}, 

103 } 

104 

105 # Properties specific for BB 

106 self.ligand = properties.get("ligand", None) 

107 self.radius = float(properties.get("radius", 5.0)) 

108 self.max_num_ligands = properties.get("max_num_ligands", 15) 

109 self.matrix_name = properties.get("matrix_name", "BLOSUM62") 

110 self.gap_open = properties.get("gap_open", -10.0) 

111 self.gap_extend = properties.get("gap_extend", -0.5) 

112 self.properties = properties 

113 

114 # Check the properties 

115 self.check_properties(properties) 

116 self.check_arguments() 

117 

118 def check_data_params(self, out_log, err_log): 

119 """Checks all the input/output paths and parameters""" 

120 self.io_dict["in"]["input_pdb_path"] = check_input_path( 

121 self.io_dict["in"]["input_pdb_path"], 

122 "input_pdb_path", 

123 self.out_log, 

124 self.__class__.__name__, 

125 ) 

126 self.io_dict["in"]["input_clusters_zip"] = check_input_path( 

127 self.io_dict["in"]["input_clusters_zip"], 

128 "input_clusters_zip", 

129 self.out_log, 

130 self.__class__.__name__, 

131 ) 

132 self.io_dict["out"]["output_pdb_path"] = check_output_path( 

133 self.io_dict["out"]["output_pdb_path"], 

134 "output_pdb_path", 

135 False, 

136 self.out_log, 

137 self.__class__.__name__, 

138 ) 

139 

140 @launchlogger 

141 def launch(self) -> int: 

142 """Execute the :class:`BindingSite <utils.bindingsite.BindingSite>` utils.bindingsite.BindingSite object.""" 

143 

144 # check input/output paths and parameters 

145 self.check_data_params(self.out_log, self.err_log) 

146 

147 # Setup Biobb 

148 if self.check_restart(): 

149 return 0 

150 self.stage_files() 

151 

152 # Parse structure 

153 fu.log( 

154 "Loading input PDB structure %s" % (self.io_dict["in"]["input_pdb_path"]), 

155 self.out_log, 

156 self.global_log, 

157 ) 

158 structure_name = PurePath(self.io_dict["in"]["input_pdb_path"]).name 

159 parser = Bio.PDB.PDBParser(QUIET=True) 

160 structPDB = parser.get_structure( 

161 structure_name, self.io_dict["in"]["input_pdb_path"] 

162 ) 

163 

164 if len(structPDB): 

165 structPDB = structPDB[0] 

166 

167 # Use only one chain 

168 n_chains = structPDB.get_list() 

169 if len(n_chains) != 1: 

170 fu.log( 

171 "More than one chain found in the input PDB structure. Using only the first chain to find the binding site", 

172 self.out_log, 

173 self.global_log, 

174 ) 

175 # get first chain in case there is more than one chain 

176 for struct_chain in structPDB.get_chains(): 

177 structPDB = struct_chain 

178 

179 # Get AA sequence 

180 structPDB_seq = get_pdb_sequence(structPDB) 

181 if len(structPDB_seq) == 0: 

182 fu.log( 

183 self.__class__.__name__ + ": Cannot extract AA sequence from the input PDB structure %s. Wrong format?" 

184 % self.io_dict["in"]["input_pdb_path"], 

185 self.out_log, 

186 ) 

187 raise SystemExit( 

188 self.__class__.__name__ + ": Cannot extract AA sequence from the input PDB structure %s. Wrong format?" 

189 % self.io_dict["in"]["input_pdb_path"] 

190 ) 

191 else: 

192 fu.log( 

193 "Found %s residues in %s" 

194 % (len(structPDB_seq), self.io_dict["in"]["input_pdb_path"]), 

195 self.out_log, 

196 ) 

197 

198 # create temporary folder for decompressing the input_clusters_zip file 

199 unique_dir = PurePath(fu.create_unique_dir()) 

200 fu.log( 

201 "Creating %s temporary folder" % unique_dir, self.out_log, self.global_log 

202 ) 

203 

204 # decompress the input_clusters_zip file 

205 cluster_list = fu.unzip_list( 

206 zip_file=self.io_dict["in"]["input_clusters_zip"], 

207 dest_dir=str(unique_dir), 

208 out_log=self.out_log, 

209 ) 

210 

211 clusterPDB_ligands_aligned = [] 

212 clusterPDB_ligands_num = 0 

213 

214 fu.log("Iterating on all clusters:", self.out_log) 

215 

216 for idx, cluster_path in enumerate(cluster_list): 

217 cluster_name = PurePath(cluster_path).stem 

218 fu.log(" ", self.out_log) 

219 fu.log( 

220 "------------ Iteration #%s --------------" % (idx + 1), self.out_log 

221 ) 

222 fu.log("Cluster member: %s" % cluster_name, self.out_log) 

223 

224 # Load and Parse PDB 

225 clusterPDB = {} 

226 clusterPDB = parser.get_structure(cluster_name, cluster_path)[0] 

227 

228 # Use only the first chain 

229 for cluster_chain in clusterPDB.get_chains(): # type: ignore 

230 clusterPDB = cluster_chain 

231 

232 # Looking for ligands 

233 clusterPDB_ligands = get_ligand_residues(clusterPDB) 

234 if (len(clusterPDB_ligands)) == 0: 

235 fu.log( 

236 "No ligands found that could guide the binding site search. Ignoring this member: %s" 

237 % cluster_name, 

238 self.out_log, 

239 ) 

240 continue 

241 

242 # Selecting the largest ligand, if more than one 

243 lig_atoms_num = 0 

244 clusterPDB_ligand = {} 

245 if self.ligand: 

246 if self.ligand in [x.get_resname() for x in clusterPDB_ligands]: 

247 for lig in clusterPDB_ligands: 

248 if lig.get_resname() == self.ligand: 

249 clusterPDB_ligand = lig 

250 lig_atoms_num = len(lig.get_list()) 

251 fu.log( 

252 "Ligand found: %s (%s atoms)" 

253 % (lig.get_resname(), lig_atoms_num), 

254 self.out_log, 

255 ) 

256 else: 

257 fu.log( 

258 "Ligand %s not found in %s cluster member, skipping this cluster" 

259 % (self.ligand, cluster_name), 

260 self.out_log, 

261 ) 

262 continue 

263 else: 

264 if len(clusterPDB_ligands) > 1: 

265 for lig_res in clusterPDB_ligands: 

266 lig_res_atoms_num = len(lig_res.get_list()) 

267 fu.log( 

268 "Ligand found: %s (%s atoms)" 

269 % (lig_res.get_resname(), lig_res_atoms_num), 

270 self.out_log, 

271 ) 

272 if lig_res_atoms_num > lig_atoms_num: 

273 clusterPDB_ligand = lig_res 

274 lig_atoms_num = lig_res_atoms_num 

275 else: 

276 clusterPDB_ligand = clusterPDB_ligands[0] 

277 lig_atoms_num = len(clusterPDB_ligands[0].get_list()) 

278 

279 fu.log( 

280 "Member accepted. Valid ligand found: %s (%s atoms)" 

281 % (clusterPDB_ligand.get_resname(), lig_atoms_num), # type: ignore 

282 self.out_log, 

283 ) 

284 

285 # Mapping residues by sequence alignment to match structPDB-clusterPDB paired residues 

286 

287 # Get AA sequence 

288 clusterPDB_seq = get_pdb_sequence(clusterPDB) 

289 

290 # Pairwise align 

291 aln, residue_map = align_sequences( 

292 structPDB_seq, 

293 clusterPDB_seq, 

294 self.matrix_name, 

295 self.gap_open, 

296 self.gap_extend, 

297 ) 

298 fu.log( 

299 "Matching residues to input PDB structure. Alignment is:\n%s" 

300 % (aln[1]), 

301 self.out_log, 

302 ) 

303 

304 # Calculate (gapless) sequence identity 

305 seq_identity, gap_seq_identity = calculate_alignment_identity( 

306 aln[0], aln[1] 

307 ) 

308 fu.log("Sequence identity (%%): %s" % (seq_identity), self.out_log) 

309 fu.log("Gap less identity (%%): %s" % (gap_seq_identity), self.out_log) 

310 

311 # Selecting aligned CA atoms from first model, first chain 

312 

313 struct_atoms = [] 

314 cluster_atoms = [] 

315 

316 for struct_res in residue_map: 

317 try: 

318 cluster_atoms.append(clusterPDB[residue_map[struct_res]]["CA"]) 

319 struct_atoms.append(get_residue_by_id(structPDB, struct_res)["CA"]) 

320 except KeyError: 

321 fu.log( 

322 "Cannot find CA atom for residue %s (input PDB %s)" 

323 % ( 

324 get_residue_by_id(structPDB, struct_res).get_resname(), 

325 struct_res, 

326 ), 

327 self.out_log, 

328 ) 

329 pass 

330 

331 if len(cluster_atoms) == 0: 

332 fu.log( 

333 self.__class__.__name__ + ": Cannot find CA atoms (1st model, 1st chain) in cluster member %s when aligning against %s. Ignoring this member." 

334 % (cluster_name, structure_name), 

335 self.out_log, 

336 ) 

337 raise SystemExit( 

338 self.__class__.__name__ + ": Cannot find CA atoms (1st model, 1st chain) in cluster member %s when aligning against %s. Ignoring this member." 

339 % (cluster_name, structure_name) 

340 ) 

341 else: 

342 fu.log( 

343 "Superimposing %s aligned protein residues" % (len(cluster_atoms)), 

344 self.out_log, 

345 ) 

346 

347 # Align against input structure 

348 

349 si = Bio.PDB.Superimposer() 

350 si.set_atoms(struct_atoms, cluster_atoms) 

351 si.apply(clusterPDB.get_atoms()) # type: ignore 

352 fu.log("RMSD: %s" % (si.rms), self.out_log) 

353 

354 # Save transformed structure (and ligand) 

355 clusterPDB_ligand_aligned = clusterPDB[clusterPDB_ligand.get_id()] # type: ignore 

356 fu.log("Saving transformed ligand coordinates", self.out_log) 

357 

358 clusterPDB_ligands_aligned.append(clusterPDB_ligand_aligned) 

359 

360 # Stop after n accepted cluster members 

361 

362 clusterPDB_ligands_num += 1 

363 

364 if clusterPDB_ligands_num > self.max_num_ligands: 

365 break 

366 

367 fu.log(" ", self.out_log) 

368 fu.log("----------------------------------------", self.out_log) 

369 fu.log( 

370 "All transformed ligand coordinates saved, getting binding site residues", 

371 self.out_log, 

372 ) 

373 

374 # Select binding site atoms as those around cluster superimposed ligands 

375 

376 fu.log( 

377 "Defining binding site residues as those %sÅ around the %s cluster superimposed ligands" 

378 % (self.radius, clusterPDB_ligands_num), 

379 self.out_log, 

380 ) 

381 

382 # select Atoms from aligned ligands 

383 clusterPDB_ligands_aligned2 = [res for res in clusterPDB_ligands_aligned] 

384 clusterPDB_ligands_aligned_atoms = Bio.PDB.Selection.unfold_entities( 

385 clusterPDB_ligands_aligned2, "A" 

386 ) 

387 

388 # select Atoms from input PDB structure 

389 structPDB_atoms = [atom for atom in structPDB.get_atoms()] 

390 

391 # compute neighbors for aligned ligands in the input PDB structure 

392 structPDB_bs_residues_raw = {} 

393 structPDB_neighbors = Bio.PDB.NeighborSearch(structPDB_atoms) 

394 for ligand_atom in clusterPDB_ligands_aligned_atoms: 

395 # look for PDB atoms 5A around each ligand atom 

396 k_l = structPDB_neighbors.search(ligand_atom.coord, self.radius, "R") 

397 for k in k_l: 

398 structPDB_bs_residues_raw[k.get_id()] = k.get_full_id() 

399 

400 # Save binding site to PDB 

401 

402 io = Bio.PDB.PDBIO() 

403 fu.log( 

404 "Writing binding site residues into %s" 

405 % (self.io_dict["out"]["output_pdb_path"]), 

406 self.out_log, 

407 ) 

408 

409 # unselect input PDB atoms not in binding site 

410 structPDB_bs_atoms = 0 

411 p = re.compile("H_|W_|W") 

412 residue_ids_to_remove = [] 

413 for res in structPDB.get_residues(): 

414 if res.id not in structPDB_bs_residues_raw.keys(): 

415 # add residue to residue_ids_to_remove list 

416 residue_ids_to_remove.append(res.id) 

417 elif p.match(res.resname): 

418 # add residue to residue_ids_to_remove list 

419 residue_ids_to_remove.append(res.id) 

420 else: 

421 # this residue will be preserved 

422 structPDB_bs_atoms += len(res.get_list()) 

423 

424 # unselect input PDB atoms not in binding site 

425 for chain in structPDB: 

426 for idr in residue_ids_to_remove: 

427 chain.detach_child(idr) 

428 

429 # write PDB file 

430 io.set_structure(structPDB) 

431 io.save(self.io_dict["out"]["output_pdb_path"]) 

432 

433 # Copy files to host 

434 self.copy_to_host() 

435 

436 self.tmp_files.append(str(unique_dir)) 

437 self.remove_tmp_files() 

438 

439 self.check_arguments(output_files_created=True, raise_exception=False) 

440 

441 return 0 

442 

443 

444def bindingsite( 

445 input_pdb_path: str, 

446 input_clusters_zip: str, 

447 output_pdb_path: str, 

448 properties: Optional[dict] = None, 

449 **kwargs, 

450) -> int: 

451 """Create the :class:`BindingSite <utils.bindingsite.BindingSite>` class and 

452 execute the :meth:`launch() <utils.bindingsite.BindingSite.launch>` method.""" 

453 return BindingSite(**dict(locals())).launch() 

454 

455 

456bindingsite.__doc__ = BindingSite.__doc__ 

457main = BindingSite.get_main(bindingsite, "Finds the binding site of the input_pdb file based on the ligands' location of similar structures (members of the sequence identity cluster)") 

458 

459 

460if __name__ == "__main__": 

461 main()