Coverage for biobb_vs/utils/bindingsite.py: 80%

176 statements  

« prev     ^ index     » next       coverage.py v7.9.1, created at 2025-06-20 11:08 +0000

1#!/usr/bin/env python3 

2 

3"""Module containing the BindingSite class and the command line interface.""" 

4 

5import argparse 

6import re 

7import warnings 

8from pathlib import PurePath 

9from typing import Optional 

10 

11from Bio import BiopythonDeprecationWarning 

12from biobb_common.configuration import settings 

13from biobb_common.generic.biobb_object import BiobbObject 

14from biobb_common.tools import file_utils as fu 

15from biobb_common.tools.file_utils import launchlogger 

16 

17from biobb_vs.utils.common import ( 

18 align_sequences, 

19 calculate_alignment_identity, 

20 check_input_path, 

21 check_output_path, 

22 get_ligand_residues, 

23 get_pdb_sequence, 

24 get_residue_by_id, 

25) 

26 

27with warnings.catch_warnings(): 

28 warnings.simplefilter("ignore", BiopythonDeprecationWarning) 

29 # try: 

30 # import Bio.SubsMat.MatrixInfo 

31 # except ImportError: 

32 import Bio.Align.substitution_matrices 

33 import Bio.pairwise2 

34 import Bio.PDB 

35 

36 

37class BindingSite(BiobbObject): 

38 """ 

39 | biobb_vs BindingSite 

40 | This class finds the binding site of the input_pdb. 

41 | Finds the binding site of the input_pdb_path file based on the ligands' location of similar structures (members of the sequence identity cluster) 

42 

43 Args: 

44 input_pdb_path (str): Path to the PDB structure where the binding site is to be found. File type: input. `Sample file <https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/data/utils/bindingsite.pdb>`_. Accepted formats: pdb (edam:format_1476). 

45 input_clusters_zip (str): Path to the ZIP file with all the PDB members of the identity cluster. File type: input. `Sample file <https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/data/utils/bindingsite.zip>`_. Accepted formats: zip (edam:format_3987). 

46 output_pdb_path (str): Path to the PDB containig the residues belonging to the binding site. File type: output. `Sample file <https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/reference/utils/ref_output_bindingsite.pdb>`_. Accepted formats: pdb (edam:format_1476). 

47 properties (dic - Python dictionary object containing the tool parameters, not input/output files): 

48 * **ligand** (*str*) - (None) Ligand to be found in the protein structure. If no ligand provided, the largest one will be selected, if more than one. 

49 * **radius** (*float*) - (5.0) [0.1~1000|0.1] Cut-off distance (Ångstroms) around ligand atoms to consider a protein atom as a binding site atom. 

50 * **max_num_ligands** (*int*) - (15) [0~1000|1] Total number of superimposed ligands to be extracted from the identity cluster. For populated clusters, the restriction avoids to superimpose redundant structures. If 0, all ligands extracted will be considered. 

51 * **matrix_name** (*str*) - ("BLOSUM62") Substitution matrices for use in alignments. Values: BENNER22, BENNER6, BENNER74, BLASTN, BLASTP, BLOSUM45, BLOSUM50, BLOSUM62, BLOSUM80, BLOSUM90, DAYHOFF, FENG, GENETIC, GONNET1992, HOXD70, JOHNSON, JONES, LEVIN, MCLACHLAN, MDM78, MEGABLAST, NUC.4.4, PAM250, PAM30, PAM70, RAO, RISLER, SCHNEIDER, STR, TRANS. 

52 * **gap_open** (*float*) - (-10.0) [-1000~1000|0.1] Gap open penalty. 

53 * **gap_extend** (*float*) - (-0.5) [-1000~1000|0.1] Gap extend penalty. 

54 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files. 

55 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist. 

56 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory. 

57 

58 Examples: 

59 This is a use example of how to use the building block from Python:: 

60 

61 from biobb_vs.utils.bindingsite import bindingsite 

62 prop = { 

63 'ligand': 'PGA', 

64 'matrix_name': 'BLOSUM62', 

65 'gap_open': -10.0, 

66 'gap_extend': -0.5, 

67 'max_num_ligands': 15, 

68 'radius': 5 

69 } 

70 bindingsite(input_pdb_path='/path/to/myStructure.pdb', 

71 input_clusters_zip='/path/to/myCluster.zip', 

72 output_pdb_path='/path/to/newStructure.pdb', 

73 properties=prop) 

74 

75 Info: 

76 * wrapped_software: 

77 * name: In house using Biopython 

78 * version: >=1.76 

79 * license: Apache-2.0 

80 * ontology: 

81 * name: EDAM 

82 * schema: http://edamontology.org/EDAM.owl 

83 

84 """ 

85 

86 def __init__( 

87 self, 

88 input_pdb_path, 

89 input_clusters_zip, 

90 output_pdb_path, 

91 properties=None, 

92 **kwargs, 

93 ) -> None: 

94 properties = properties or {} 

95 

96 # Call parent class constructor 

97 super().__init__(properties) 

98 self.locals_var_dict = locals().copy() 

99 

100 # Input/Output files 

101 self.io_dict = { 

102 "in": { 

103 "input_pdb_path": input_pdb_path, 

104 "input_clusters_zip": input_clusters_zip, 

105 }, 

106 "out": {"output_pdb_path": output_pdb_path}, 

107 } 

108 

109 # Properties specific for BB 

110 self.ligand = properties.get("ligand", None) 

111 self.radius = float(properties.get("radius", 5.0)) 

112 self.max_num_ligands = properties.get("max_num_ligands", 15) 

113 self.matrix_name = properties.get("matrix_name", "BLOSUM62") 

114 self.gap_open = properties.get("gap_open", -10.0) 

115 self.gap_extend = properties.get("gap_extend", -0.5) 

116 self.properties = properties 

117 

118 # Check the properties 

119 self.check_properties(properties) 

120 self.check_arguments() 

121 

122 def check_data_params(self, out_log, err_log): 

123 """Checks all the input/output paths and parameters""" 

124 self.io_dict["in"]["input_pdb_path"] = check_input_path( 

125 self.io_dict["in"]["input_pdb_path"], 

126 "input_pdb_path", 

127 self.out_log, 

128 self.__class__.__name__, 

129 ) 

130 self.io_dict["in"]["input_clusters_zip"] = check_input_path( 

131 self.io_dict["in"]["input_clusters_zip"], 

132 "input_clusters_zip", 

133 self.out_log, 

134 self.__class__.__name__, 

135 ) 

136 self.io_dict["out"]["output_pdb_path"] = check_output_path( 

137 self.io_dict["out"]["output_pdb_path"], 

138 "output_pdb_path", 

139 False, 

140 self.out_log, 

141 self.__class__.__name__, 

142 ) 

143 

144 @launchlogger 

145 def launch(self) -> int: 

146 """Execute the :class:`BindingSite <utils.bindingsite.BindingSite>` utils.bindingsite.BindingSite object.""" 

147 

148 # check input/output paths and parameters 

149 self.check_data_params(self.out_log, self.err_log) 

150 

151 # Setup Biobb 

152 if self.check_restart(): 

153 return 0 

154 self.stage_files() 

155 

156 # Parse structure 

157 fu.log( 

158 "Loading input PDB structure %s" % (self.io_dict["in"]["input_pdb_path"]), 

159 self.out_log, 

160 self.global_log, 

161 ) 

162 structure_name = PurePath(self.io_dict["in"]["input_pdb_path"]).name 

163 parser = Bio.PDB.PDBParser(QUIET=True) 

164 structPDB = parser.get_structure( 

165 structure_name, self.io_dict["in"]["input_pdb_path"] 

166 ) 

167 

168 if len(structPDB): 

169 structPDB = structPDB[0] 

170 

171 # Use only one chain 

172 n_chains = structPDB.get_list() 

173 if len(n_chains) != 1: 

174 fu.log( 

175 "More than one chain found in the input PDB structure. Using only the first chain to find the binding site", 

176 self.out_log, 

177 self.global_log, 

178 ) 

179 # get first chain in case there is more than one chain 

180 for struct_chain in structPDB.get_chains(): 

181 structPDB = struct_chain 

182 

183 # Get AA sequence 

184 structPDB_seq = get_pdb_sequence(structPDB) 

185 if len(structPDB_seq) == 0: 

186 fu.log( 

187 self.__class__.__name__ + ": Cannot extract AA sequence from the input PDB structure %s. Wrong format?" 

188 % self.io_dict["in"]["input_pdb_path"], 

189 self.out_log, 

190 ) 

191 raise SystemExit( 

192 self.__class__.__name__ + ": Cannot extract AA sequence from the input PDB structure %s. Wrong format?" 

193 % self.io_dict["in"]["input_pdb_path"] 

194 ) 

195 else: 

196 fu.log( 

197 "Found %s residues in %s" 

198 % (len(structPDB_seq), self.io_dict["in"]["input_pdb_path"]), 

199 self.out_log, 

200 ) 

201 

202 # create temporary folder for decompressing the input_clusters_zip file 

203 unique_dir = PurePath(fu.create_unique_dir()) 

204 fu.log( 

205 "Creating %s temporary folder" % unique_dir, self.out_log, self.global_log 

206 ) 

207 

208 # decompress the input_clusters_zip file 

209 cluster_list = fu.unzip_list( 

210 zip_file=self.io_dict["in"]["input_clusters_zip"], 

211 dest_dir=str(unique_dir), 

212 out_log=self.out_log, 

213 ) 

214 

215 clusterPDB_ligands_aligned = [] 

216 clusterPDB_ligands_num = 0 

217 

218 fu.log("Iterating on all clusters:", self.out_log) 

219 

220 for idx, cluster_path in enumerate(cluster_list): 

221 cluster_name = PurePath(cluster_path).stem 

222 fu.log(" ", self.out_log) 

223 fu.log( 

224 "------------ Iteration #%s --------------" % (idx + 1), self.out_log 

225 ) 

226 fu.log("Cluster member: %s" % cluster_name, self.out_log) 

227 

228 # Load and Parse PDB 

229 clusterPDB = {} 

230 clusterPDB = parser.get_structure(cluster_name, cluster_path)[0] 

231 

232 # Use only the first chain 

233 for cluster_chain in clusterPDB.get_chains(): # type: ignore 

234 clusterPDB = cluster_chain 

235 

236 # Looking for ligands 

237 clusterPDB_ligands = get_ligand_residues(clusterPDB) 

238 if (len(clusterPDB_ligands)) == 0: 

239 fu.log( 

240 "No ligands found that could guide the binding site search. Ignoring this member: %s" 

241 % cluster_name, 

242 self.out_log, 

243 ) 

244 continue 

245 

246 # Selecting the largest ligand, if more than one 

247 lig_atoms_num = 0 

248 clusterPDB_ligand = {} 

249 if self.ligand: 

250 if self.ligand in [x.get_resname() for x in clusterPDB_ligands]: 

251 for lig in clusterPDB_ligands: 

252 if lig.get_resname() == self.ligand: 

253 clusterPDB_ligand = lig 

254 lig_atoms_num = len(lig.get_list()) 

255 fu.log( 

256 "Ligand found: %s (%s atoms)" 

257 % (lig.get_resname(), lig_atoms_num), 

258 self.out_log, 

259 ) 

260 else: 

261 fu.log( 

262 "Ligand %s not found in %s cluster member, skipping this cluster" 

263 % (self.ligand, cluster_name), 

264 self.out_log, 

265 ) 

266 continue 

267 else: 

268 if len(clusterPDB_ligands) > 1: 

269 for lig_res in clusterPDB_ligands: 

270 lig_res_atoms_num = len(lig_res.get_list()) 

271 fu.log( 

272 "Ligand found: %s (%s atoms)" 

273 % (lig_res.get_resname(), lig_res_atoms_num), 

274 self.out_log, 

275 ) 

276 if lig_res_atoms_num > lig_atoms_num: 

277 clusterPDB_ligand = lig_res 

278 lig_atoms_num = lig_res_atoms_num 

279 else: 

280 clusterPDB_ligand = clusterPDB_ligands[0] 

281 lig_atoms_num = len(clusterPDB_ligands[0].get_list()) 

282 

283 fu.log( 

284 "Member accepted. Valid ligand found: %s (%s atoms)" 

285 % (clusterPDB_ligand.get_resname(), lig_atoms_num), # type: ignore 

286 self.out_log, 

287 ) 

288 

289 # Mapping residues by sequence alignment to match structPDB-clusterPDB paired residues 

290 

291 # Get AA sequence 

292 clusterPDB_seq = get_pdb_sequence(clusterPDB) 

293 

294 # Pairwise align 

295 aln, residue_map = align_sequences( 

296 structPDB_seq, 

297 clusterPDB_seq, 

298 self.matrix_name, 

299 self.gap_open, 

300 self.gap_extend, 

301 ) 

302 fu.log( 

303 "Matching residues to input PDB structure. Alignment is:\n%s" 

304 % (aln[1]), 

305 self.out_log, 

306 ) 

307 

308 # Calculate (gapless) sequence identity 

309 seq_identity, gap_seq_identity = calculate_alignment_identity( 

310 aln[0], aln[1] 

311 ) 

312 fu.log("Sequence identity (%%): %s" % (seq_identity), self.out_log) 

313 fu.log("Gap less identity (%%): %s" % (gap_seq_identity), self.out_log) 

314 

315 # Selecting aligned CA atoms from first model, first chain 

316 

317 struct_atoms = [] 

318 cluster_atoms = [] 

319 

320 for struct_res in residue_map: 

321 try: 

322 cluster_atoms.append(clusterPDB[residue_map[struct_res]]["CA"]) 

323 struct_atoms.append(get_residue_by_id(structPDB, struct_res)["CA"]) 

324 except KeyError: 

325 fu.log( 

326 "Cannot find CA atom for residue %s (input PDB %s)" 

327 % ( 

328 get_residue_by_id(structPDB, struct_res).get_resname(), 

329 struct_res, 

330 ), 

331 self.out_log, 

332 ) 

333 pass 

334 

335 if len(cluster_atoms) == 0: 

336 fu.log( 

337 self.__class__.__name__ + ": Cannot find CA atoms (1st model, 1st chain) in cluster member %s when aligning against %s. Ignoring this member." 

338 % (cluster_name, structure_name), 

339 self.out_log, 

340 ) 

341 raise SystemExit( 

342 self.__class__.__name__ + ": Cannot find CA atoms (1st model, 1st chain) in cluster member %s when aligning against %s. Ignoring this member." 

343 % (cluster_name, structure_name) 

344 ) 

345 else: 

346 fu.log( 

347 "Superimposing %s aligned protein residues" % (len(cluster_atoms)), 

348 self.out_log, 

349 ) 

350 

351 # Align against input structure 

352 

353 si = Bio.PDB.Superimposer() 

354 si.set_atoms(struct_atoms, cluster_atoms) 

355 si.apply(clusterPDB.get_atoms()) # type: ignore 

356 fu.log("RMSD: %s" % (si.rms), self.out_log) 

357 

358 # Save transformed structure (and ligand) 

359 clusterPDB_ligand_aligned = clusterPDB[clusterPDB_ligand.get_id()] # type: ignore 

360 fu.log("Saving transformed ligand coordinates", self.out_log) 

361 

362 clusterPDB_ligands_aligned.append(clusterPDB_ligand_aligned) 

363 

364 # Stop after n accepted cluster members 

365 

366 clusterPDB_ligands_num += 1 

367 

368 if clusterPDB_ligands_num > self.max_num_ligands: 

369 break 

370 

371 fu.log(" ", self.out_log) 

372 fu.log("----------------------------------------", self.out_log) 

373 fu.log( 

374 "All transformed ligand coordinates saved, getting binding site residues", 

375 self.out_log, 

376 ) 

377 

378 # Select binding site atoms as those around cluster superimposed ligands 

379 

380 fu.log( 

381 "Defining binding site residues as those %sÅ around the %s cluster superimposed ligands" 

382 % (self.radius, clusterPDB_ligands_num), 

383 self.out_log, 

384 ) 

385 

386 # select Atoms from aligned ligands 

387 clusterPDB_ligands_aligned2 = [res for res in clusterPDB_ligands_aligned] 

388 clusterPDB_ligands_aligned_atoms = Bio.PDB.Selection.unfold_entities( 

389 clusterPDB_ligands_aligned2, "A" 

390 ) 

391 

392 # select Atoms from input PDB structure 

393 structPDB_atoms = [atom for atom in structPDB.get_atoms()] 

394 

395 # compute neighbors for aligned ligands in the input PDB structure 

396 structPDB_bs_residues_raw = {} 

397 structPDB_neighbors = Bio.PDB.NeighborSearch(structPDB_atoms) 

398 for ligand_atom in clusterPDB_ligands_aligned_atoms: 

399 # look for PDB atoms 5A around each ligand atom 

400 k_l = structPDB_neighbors.search(ligand_atom.coord, self.radius, "R") 

401 for k in k_l: 

402 structPDB_bs_residues_raw[k.get_id()] = k.get_full_id() 

403 

404 # Save binding site to PDB 

405 

406 io = Bio.PDB.PDBIO() 

407 fu.log( 

408 "Writing binding site residues into %s" 

409 % (self.io_dict["out"]["output_pdb_path"]), 

410 self.out_log, 

411 ) 

412 

413 # unselect input PDB atoms not in binding site 

414 structPDB_bs_atoms = 0 

415 p = re.compile("H_|W_|W") 

416 residue_ids_to_remove = [] 

417 for res in structPDB.get_residues(): 

418 if res.id not in structPDB_bs_residues_raw.keys(): 

419 # add residue to residue_ids_to_remove list 

420 residue_ids_to_remove.append(res.id) 

421 elif p.match(res.resname): 

422 # add residue to residue_ids_to_remove list 

423 residue_ids_to_remove.append(res.id) 

424 else: 

425 # this residue will be preserved 

426 structPDB_bs_atoms += len(res.get_list()) 

427 

428 # unselect input PDB atoms not in binding site 

429 for chain in structPDB: 

430 for idr in residue_ids_to_remove: 

431 chain.detach_child(idr) 

432 

433 # write PDB file 

434 io.set_structure(structPDB) 

435 io.save(self.io_dict["out"]["output_pdb_path"]) 

436 

437 # Copy files to host 

438 self.copy_to_host() 

439 

440 self.tmp_files.extend([ 

441 # self.stage_io_dict.get("unique_dir", ""), 

442 str(unique_dir) 

443 ]) 

444 self.remove_tmp_files() 

445 

446 self.check_arguments(output_files_created=True, raise_exception=False) 

447 

448 return 0 

449 

450 

451def bindingsite( 

452 input_pdb_path: str, 

453 input_clusters_zip: str, 

454 output_pdb_path: str, 

455 properties: Optional[dict] = None, 

456 **kwargs, 

457) -> int: 

458 """Execute the :class:`BindingSite <utils.bindingsite.BindingSite>` class and 

459 execute the :meth:`launch() <utils.bindingsite.BindingSite.launch>` method.""" 

460 

461 return BindingSite( 

462 input_pdb_path=input_pdb_path, 

463 input_clusters_zip=input_clusters_zip, 

464 output_pdb_path=output_pdb_path, 

465 properties=properties, 

466 **kwargs, 

467 ).launch() 

468 

469 bindingsite.__doc__ = BindingSite.__doc__ 

470 

471 

472def main(): 

473 """Command line execution of this building block. Please check the command line documentation.""" 

474 parser = argparse.ArgumentParser( 

475 description="Finds the binding site of the input_pdb file based on the ligands' location of similar structures (members of the sequence identity cluster)", 

476 formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999), 

477 ) 

478 parser.add_argument("--config", required=False, help="Configuration file") 

479 

480 # Specific args of each building block 

481 required_args = parser.add_argument_group("required arguments") 

482 required_args.add_argument( 

483 "--input_pdb_path", 

484 required=True, 

485 help="Path to the PDB structure where the binding site is to be found. Accepted formats: pdb.", 

486 ) 

487 required_args.add_argument( 

488 "--input_clusters_zip", 

489 required=True, 

490 help="Path to the ZIP file with all the PDB members of the identity cluster. Accepted formats: zip.", 

491 ) 

492 required_args.add_argument( 

493 "--output_pdb_path", 

494 required=True, 

495 help="Path to the PDB containig the residues belonging to the binding site. Accepted formats: pdb.", 

496 ) 

497 

498 args = parser.parse_args() 

499 args.config = args.config or "{}" 

500 properties = settings.ConfReader(config=args.config).get_prop_dic() 

501 

502 # Specific call of each building block 

503 bindingsite( 

504 input_pdb_path=args.input_pdb_path, 

505 input_clusters_zip=args.input_clusters_zip, 

506 output_pdb_path=args.output_pdb_path, 

507 properties=properties, 

508 ) 

509 

510 

511if __name__ == "__main__": 

512 main()