Coverage for biobb_vs/utils/bindingsite.py: 80%
176 statements
« prev ^ index » next coverage.py v7.9.1, created at 2025-06-20 11:08 +0000
« prev ^ index » next coverage.py v7.9.1, created at 2025-06-20 11:08 +0000
1#!/usr/bin/env python3
3"""Module containing the BindingSite class and the command line interface."""
5import argparse
6import re
7import warnings
8from pathlib import PurePath
9from typing import Optional
11from Bio import BiopythonDeprecationWarning
12from biobb_common.configuration import settings
13from biobb_common.generic.biobb_object import BiobbObject
14from biobb_common.tools import file_utils as fu
15from biobb_common.tools.file_utils import launchlogger
17from biobb_vs.utils.common import (
18 align_sequences,
19 calculate_alignment_identity,
20 check_input_path,
21 check_output_path,
22 get_ligand_residues,
23 get_pdb_sequence,
24 get_residue_by_id,
25)
27with warnings.catch_warnings():
28 warnings.simplefilter("ignore", BiopythonDeprecationWarning)
29 # try:
30 # import Bio.SubsMat.MatrixInfo
31 # except ImportError:
32 import Bio.Align.substitution_matrices
33 import Bio.pairwise2
34 import Bio.PDB
37class BindingSite(BiobbObject):
38 """
39 | biobb_vs BindingSite
40 | This class finds the binding site of the input_pdb.
41 | Finds the binding site of the input_pdb_path file based on the ligands' location of similar structures (members of the sequence identity cluster)
43 Args:
44 input_pdb_path (str): Path to the PDB structure where the binding site is to be found. File type: input. `Sample file <https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/data/utils/bindingsite.pdb>`_. Accepted formats: pdb (edam:format_1476).
45 input_clusters_zip (str): Path to the ZIP file with all the PDB members of the identity cluster. File type: input. `Sample file <https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/data/utils/bindingsite.zip>`_. Accepted formats: zip (edam:format_3987).
46 output_pdb_path (str): Path to the PDB containig the residues belonging to the binding site. File type: output. `Sample file <https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/reference/utils/ref_output_bindingsite.pdb>`_. Accepted formats: pdb (edam:format_1476).
47 properties (dic - Python dictionary object containing the tool parameters, not input/output files):
48 * **ligand** (*str*) - (None) Ligand to be found in the protein structure. If no ligand provided, the largest one will be selected, if more than one.
49 * **radius** (*float*) - (5.0) [0.1~1000|0.1] Cut-off distance (Ångstroms) around ligand atoms to consider a protein atom as a binding site atom.
50 * **max_num_ligands** (*int*) - (15) [0~1000|1] Total number of superimposed ligands to be extracted from the identity cluster. For populated clusters, the restriction avoids to superimpose redundant structures. If 0, all ligands extracted will be considered.
51 * **matrix_name** (*str*) - ("BLOSUM62") Substitution matrices for use in alignments. Values: BENNER22, BENNER6, BENNER74, BLASTN, BLASTP, BLOSUM45, BLOSUM50, BLOSUM62, BLOSUM80, BLOSUM90, DAYHOFF, FENG, GENETIC, GONNET1992, HOXD70, JOHNSON, JONES, LEVIN, MCLACHLAN, MDM78, MEGABLAST, NUC.4.4, PAM250, PAM30, PAM70, RAO, RISLER, SCHNEIDER, STR, TRANS.
52 * **gap_open** (*float*) - (-10.0) [-1000~1000|0.1] Gap open penalty.
53 * **gap_extend** (*float*) - (-0.5) [-1000~1000|0.1] Gap extend penalty.
54 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
55 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
56 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.
58 Examples:
59 This is a use example of how to use the building block from Python::
61 from biobb_vs.utils.bindingsite import bindingsite
62 prop = {
63 'ligand': 'PGA',
64 'matrix_name': 'BLOSUM62',
65 'gap_open': -10.0,
66 'gap_extend': -0.5,
67 'max_num_ligands': 15,
68 'radius': 5
69 }
70 bindingsite(input_pdb_path='/path/to/myStructure.pdb',
71 input_clusters_zip='/path/to/myCluster.zip',
72 output_pdb_path='/path/to/newStructure.pdb',
73 properties=prop)
75 Info:
76 * wrapped_software:
77 * name: In house using Biopython
78 * version: >=1.76
79 * license: Apache-2.0
80 * ontology:
81 * name: EDAM
82 * schema: http://edamontology.org/EDAM.owl
84 """
86 def __init__(
87 self,
88 input_pdb_path,
89 input_clusters_zip,
90 output_pdb_path,
91 properties=None,
92 **kwargs,
93 ) -> None:
94 properties = properties or {}
96 # Call parent class constructor
97 super().__init__(properties)
98 self.locals_var_dict = locals().copy()
100 # Input/Output files
101 self.io_dict = {
102 "in": {
103 "input_pdb_path": input_pdb_path,
104 "input_clusters_zip": input_clusters_zip,
105 },
106 "out": {"output_pdb_path": output_pdb_path},
107 }
109 # Properties specific for BB
110 self.ligand = properties.get("ligand", None)
111 self.radius = float(properties.get("radius", 5.0))
112 self.max_num_ligands = properties.get("max_num_ligands", 15)
113 self.matrix_name = properties.get("matrix_name", "BLOSUM62")
114 self.gap_open = properties.get("gap_open", -10.0)
115 self.gap_extend = properties.get("gap_extend", -0.5)
116 self.properties = properties
118 # Check the properties
119 self.check_properties(properties)
120 self.check_arguments()
122 def check_data_params(self, out_log, err_log):
123 """Checks all the input/output paths and parameters"""
124 self.io_dict["in"]["input_pdb_path"] = check_input_path(
125 self.io_dict["in"]["input_pdb_path"],
126 "input_pdb_path",
127 self.out_log,
128 self.__class__.__name__,
129 )
130 self.io_dict["in"]["input_clusters_zip"] = check_input_path(
131 self.io_dict["in"]["input_clusters_zip"],
132 "input_clusters_zip",
133 self.out_log,
134 self.__class__.__name__,
135 )
136 self.io_dict["out"]["output_pdb_path"] = check_output_path(
137 self.io_dict["out"]["output_pdb_path"],
138 "output_pdb_path",
139 False,
140 self.out_log,
141 self.__class__.__name__,
142 )
144 @launchlogger
145 def launch(self) -> int:
146 """Execute the :class:`BindingSite <utils.bindingsite.BindingSite>` utils.bindingsite.BindingSite object."""
148 # check input/output paths and parameters
149 self.check_data_params(self.out_log, self.err_log)
151 # Setup Biobb
152 if self.check_restart():
153 return 0
154 self.stage_files()
156 # Parse structure
157 fu.log(
158 "Loading input PDB structure %s" % (self.io_dict["in"]["input_pdb_path"]),
159 self.out_log,
160 self.global_log,
161 )
162 structure_name = PurePath(self.io_dict["in"]["input_pdb_path"]).name
163 parser = Bio.PDB.PDBParser(QUIET=True)
164 structPDB = parser.get_structure(
165 structure_name, self.io_dict["in"]["input_pdb_path"]
166 )
168 if len(structPDB):
169 structPDB = structPDB[0]
171 # Use only one chain
172 n_chains = structPDB.get_list()
173 if len(n_chains) != 1:
174 fu.log(
175 "More than one chain found in the input PDB structure. Using only the first chain to find the binding site",
176 self.out_log,
177 self.global_log,
178 )
179 # get first chain in case there is more than one chain
180 for struct_chain in structPDB.get_chains():
181 structPDB = struct_chain
183 # Get AA sequence
184 structPDB_seq = get_pdb_sequence(structPDB)
185 if len(structPDB_seq) == 0:
186 fu.log(
187 self.__class__.__name__ + ": Cannot extract AA sequence from the input PDB structure %s. Wrong format?"
188 % self.io_dict["in"]["input_pdb_path"],
189 self.out_log,
190 )
191 raise SystemExit(
192 self.__class__.__name__ + ": Cannot extract AA sequence from the input PDB structure %s. Wrong format?"
193 % self.io_dict["in"]["input_pdb_path"]
194 )
195 else:
196 fu.log(
197 "Found %s residues in %s"
198 % (len(structPDB_seq), self.io_dict["in"]["input_pdb_path"]),
199 self.out_log,
200 )
202 # create temporary folder for decompressing the input_clusters_zip file
203 unique_dir = PurePath(fu.create_unique_dir())
204 fu.log(
205 "Creating %s temporary folder" % unique_dir, self.out_log, self.global_log
206 )
208 # decompress the input_clusters_zip file
209 cluster_list = fu.unzip_list(
210 zip_file=self.io_dict["in"]["input_clusters_zip"],
211 dest_dir=str(unique_dir),
212 out_log=self.out_log,
213 )
215 clusterPDB_ligands_aligned = []
216 clusterPDB_ligands_num = 0
218 fu.log("Iterating on all clusters:", self.out_log)
220 for idx, cluster_path in enumerate(cluster_list):
221 cluster_name = PurePath(cluster_path).stem
222 fu.log(" ", self.out_log)
223 fu.log(
224 "------------ Iteration #%s --------------" % (idx + 1), self.out_log
225 )
226 fu.log("Cluster member: %s" % cluster_name, self.out_log)
228 # Load and Parse PDB
229 clusterPDB = {}
230 clusterPDB = parser.get_structure(cluster_name, cluster_path)[0]
232 # Use only the first chain
233 for cluster_chain in clusterPDB.get_chains(): # type: ignore
234 clusterPDB = cluster_chain
236 # Looking for ligands
237 clusterPDB_ligands = get_ligand_residues(clusterPDB)
238 if (len(clusterPDB_ligands)) == 0:
239 fu.log(
240 "No ligands found that could guide the binding site search. Ignoring this member: %s"
241 % cluster_name,
242 self.out_log,
243 )
244 continue
246 # Selecting the largest ligand, if more than one
247 lig_atoms_num = 0
248 clusterPDB_ligand = {}
249 if self.ligand:
250 if self.ligand in [x.get_resname() for x in clusterPDB_ligands]:
251 for lig in clusterPDB_ligands:
252 if lig.get_resname() == self.ligand:
253 clusterPDB_ligand = lig
254 lig_atoms_num = len(lig.get_list())
255 fu.log(
256 "Ligand found: %s (%s atoms)"
257 % (lig.get_resname(), lig_atoms_num),
258 self.out_log,
259 )
260 else:
261 fu.log(
262 "Ligand %s not found in %s cluster member, skipping this cluster"
263 % (self.ligand, cluster_name),
264 self.out_log,
265 )
266 continue
267 else:
268 if len(clusterPDB_ligands) > 1:
269 for lig_res in clusterPDB_ligands:
270 lig_res_atoms_num = len(lig_res.get_list())
271 fu.log(
272 "Ligand found: %s (%s atoms)"
273 % (lig_res.get_resname(), lig_res_atoms_num),
274 self.out_log,
275 )
276 if lig_res_atoms_num > lig_atoms_num:
277 clusterPDB_ligand = lig_res
278 lig_atoms_num = lig_res_atoms_num
279 else:
280 clusterPDB_ligand = clusterPDB_ligands[0]
281 lig_atoms_num = len(clusterPDB_ligands[0].get_list())
283 fu.log(
284 "Member accepted. Valid ligand found: %s (%s atoms)"
285 % (clusterPDB_ligand.get_resname(), lig_atoms_num), # type: ignore
286 self.out_log,
287 )
289 # Mapping residues by sequence alignment to match structPDB-clusterPDB paired residues
291 # Get AA sequence
292 clusterPDB_seq = get_pdb_sequence(clusterPDB)
294 # Pairwise align
295 aln, residue_map = align_sequences(
296 structPDB_seq,
297 clusterPDB_seq,
298 self.matrix_name,
299 self.gap_open,
300 self.gap_extend,
301 )
302 fu.log(
303 "Matching residues to input PDB structure. Alignment is:\n%s"
304 % (aln[1]),
305 self.out_log,
306 )
308 # Calculate (gapless) sequence identity
309 seq_identity, gap_seq_identity = calculate_alignment_identity(
310 aln[0], aln[1]
311 )
312 fu.log("Sequence identity (%%): %s" % (seq_identity), self.out_log)
313 fu.log("Gap less identity (%%): %s" % (gap_seq_identity), self.out_log)
315 # Selecting aligned CA atoms from first model, first chain
317 struct_atoms = []
318 cluster_atoms = []
320 for struct_res in residue_map:
321 try:
322 cluster_atoms.append(clusterPDB[residue_map[struct_res]]["CA"])
323 struct_atoms.append(get_residue_by_id(structPDB, struct_res)["CA"])
324 except KeyError:
325 fu.log(
326 "Cannot find CA atom for residue %s (input PDB %s)"
327 % (
328 get_residue_by_id(structPDB, struct_res).get_resname(),
329 struct_res,
330 ),
331 self.out_log,
332 )
333 pass
335 if len(cluster_atoms) == 0:
336 fu.log(
337 self.__class__.__name__ + ": Cannot find CA atoms (1st model, 1st chain) in cluster member %s when aligning against %s. Ignoring this member."
338 % (cluster_name, structure_name),
339 self.out_log,
340 )
341 raise SystemExit(
342 self.__class__.__name__ + ": Cannot find CA atoms (1st model, 1st chain) in cluster member %s when aligning against %s. Ignoring this member."
343 % (cluster_name, structure_name)
344 )
345 else:
346 fu.log(
347 "Superimposing %s aligned protein residues" % (len(cluster_atoms)),
348 self.out_log,
349 )
351 # Align against input structure
353 si = Bio.PDB.Superimposer()
354 si.set_atoms(struct_atoms, cluster_atoms)
355 si.apply(clusterPDB.get_atoms()) # type: ignore
356 fu.log("RMSD: %s" % (si.rms), self.out_log)
358 # Save transformed structure (and ligand)
359 clusterPDB_ligand_aligned = clusterPDB[clusterPDB_ligand.get_id()] # type: ignore
360 fu.log("Saving transformed ligand coordinates", self.out_log)
362 clusterPDB_ligands_aligned.append(clusterPDB_ligand_aligned)
364 # Stop after n accepted cluster members
366 clusterPDB_ligands_num += 1
368 if clusterPDB_ligands_num > self.max_num_ligands:
369 break
371 fu.log(" ", self.out_log)
372 fu.log("----------------------------------------", self.out_log)
373 fu.log(
374 "All transformed ligand coordinates saved, getting binding site residues",
375 self.out_log,
376 )
378 # Select binding site atoms as those around cluster superimposed ligands
380 fu.log(
381 "Defining binding site residues as those %sÅ around the %s cluster superimposed ligands"
382 % (self.radius, clusterPDB_ligands_num),
383 self.out_log,
384 )
386 # select Atoms from aligned ligands
387 clusterPDB_ligands_aligned2 = [res for res in clusterPDB_ligands_aligned]
388 clusterPDB_ligands_aligned_atoms = Bio.PDB.Selection.unfold_entities(
389 clusterPDB_ligands_aligned2, "A"
390 )
392 # select Atoms from input PDB structure
393 structPDB_atoms = [atom for atom in structPDB.get_atoms()]
395 # compute neighbors for aligned ligands in the input PDB structure
396 structPDB_bs_residues_raw = {}
397 structPDB_neighbors = Bio.PDB.NeighborSearch(structPDB_atoms)
398 for ligand_atom in clusterPDB_ligands_aligned_atoms:
399 # look for PDB atoms 5A around each ligand atom
400 k_l = structPDB_neighbors.search(ligand_atom.coord, self.radius, "R")
401 for k in k_l:
402 structPDB_bs_residues_raw[k.get_id()] = k.get_full_id()
404 # Save binding site to PDB
406 io = Bio.PDB.PDBIO()
407 fu.log(
408 "Writing binding site residues into %s"
409 % (self.io_dict["out"]["output_pdb_path"]),
410 self.out_log,
411 )
413 # unselect input PDB atoms not in binding site
414 structPDB_bs_atoms = 0
415 p = re.compile("H_|W_|W")
416 residue_ids_to_remove = []
417 for res in structPDB.get_residues():
418 if res.id not in structPDB_bs_residues_raw.keys():
419 # add residue to residue_ids_to_remove list
420 residue_ids_to_remove.append(res.id)
421 elif p.match(res.resname):
422 # add residue to residue_ids_to_remove list
423 residue_ids_to_remove.append(res.id)
424 else:
425 # this residue will be preserved
426 structPDB_bs_atoms += len(res.get_list())
428 # unselect input PDB atoms not in binding site
429 for chain in structPDB:
430 for idr in residue_ids_to_remove:
431 chain.detach_child(idr)
433 # write PDB file
434 io.set_structure(structPDB)
435 io.save(self.io_dict["out"]["output_pdb_path"])
437 # Copy files to host
438 self.copy_to_host()
440 self.tmp_files.extend([
441 # self.stage_io_dict.get("unique_dir", ""),
442 str(unique_dir)
443 ])
444 self.remove_tmp_files()
446 self.check_arguments(output_files_created=True, raise_exception=False)
448 return 0
451def bindingsite(
452 input_pdb_path: str,
453 input_clusters_zip: str,
454 output_pdb_path: str,
455 properties: Optional[dict] = None,
456 **kwargs,
457) -> int:
458 """Execute the :class:`BindingSite <utils.bindingsite.BindingSite>` class and
459 execute the :meth:`launch() <utils.bindingsite.BindingSite.launch>` method."""
461 return BindingSite(
462 input_pdb_path=input_pdb_path,
463 input_clusters_zip=input_clusters_zip,
464 output_pdb_path=output_pdb_path,
465 properties=properties,
466 **kwargs,
467 ).launch()
469 bindingsite.__doc__ = BindingSite.__doc__
472def main():
473 """Command line execution of this building block. Please check the command line documentation."""
474 parser = argparse.ArgumentParser(
475 description="Finds the binding site of the input_pdb file based on the ligands' location of similar structures (members of the sequence identity cluster)",
476 formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999),
477 )
478 parser.add_argument("--config", required=False, help="Configuration file")
480 # Specific args of each building block
481 required_args = parser.add_argument_group("required arguments")
482 required_args.add_argument(
483 "--input_pdb_path",
484 required=True,
485 help="Path to the PDB structure where the binding site is to be found. Accepted formats: pdb.",
486 )
487 required_args.add_argument(
488 "--input_clusters_zip",
489 required=True,
490 help="Path to the ZIP file with all the PDB members of the identity cluster. Accepted formats: zip.",
491 )
492 required_args.add_argument(
493 "--output_pdb_path",
494 required=True,
495 help="Path to the PDB containig the residues belonging to the binding site. Accepted formats: pdb.",
496 )
498 args = parser.parse_args()
499 args.config = args.config or "{}"
500 properties = settings.ConfReader(config=args.config).get_prop_dic()
502 # Specific call of each building block
503 bindingsite(
504 input_pdb_path=args.input_pdb_path,
505 input_clusters_zip=args.input_clusters_zip,
506 output_pdb_path=args.output_pdb_path,
507 properties=properties,
508 )
511if __name__ == "__main__":
512 main()