Coverage for biobb_vs / utils / bindingsite.py: 85%
164 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-22 13:24 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-22 13:24 +0000
1#!/usr/bin/env python3
3"""Module containing the BindingSite class and the command line interface."""
4import re
5import warnings
6from pathlib import PurePath
7from typing import Optional
8from Bio import BiopythonDeprecationWarning
9from biobb_common.generic.biobb_object import BiobbObject
10from biobb_common.tools import file_utils as fu
11from biobb_common.tools.file_utils import launchlogger
13from biobb_vs.utils.common import (
14 align_sequences,
15 calculate_alignment_identity,
16 check_input_path,
17 check_output_path,
18 get_ligand_residues,
19 get_pdb_sequence,
20 get_residue_by_id,
21)
23with warnings.catch_warnings():
24 warnings.simplefilter("ignore", BiopythonDeprecationWarning)
25 # try:
26 # import Bio.SubsMat.MatrixInfo
27 # except ImportError:
28 import Bio.Align.substitution_matrices
29 import Bio.pairwise2
30 import Bio.PDB
33class BindingSite(BiobbObject):
34 """
35 | biobb_vs BindingSite
36 | This class finds the binding site of the input_pdb.
37 | Finds the binding site of the input_pdb_path file based on the ligands' location of similar structures (members of the sequence identity cluster)
39 Args:
40 input_pdb_path (str): Path to the PDB structure where the binding site is to be found. File type: input. `Sample file <https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/data/utils/bindingsite.pdb>`_. Accepted formats: pdb (edam:format_1476).
41 input_clusters_zip (str): Path to the ZIP file with all the PDB members of the identity cluster. File type: input. `Sample file <https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/data/utils/bindingsite.zip>`_. Accepted formats: zip (edam:format_3987).
42 output_pdb_path (str): Path to the PDB containig the residues belonging to the binding site. File type: output. `Sample file <https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/reference/utils/ref_output_bindingsite.pdb>`_. Accepted formats: pdb (edam:format_1476).
43 properties (dic - Python dictionary object containing the tool parameters, not input/output files):
44 * **ligand** (*str*) - (None) Ligand to be found in the protein structure. If no ligand provided, the largest one will be selected, if more than one.
45 * **radius** (*float*) - (5.0) [0.1~1000|0.1] Cut-off distance (Ångstroms) around ligand atoms to consider a protein atom as a binding site atom.
46 * **max_num_ligands** (*int*) - (15) [0~1000|1] Total number of superimposed ligands to be extracted from the identity cluster. For populated clusters, the restriction avoids to superimpose redundant structures. If 0, all ligands extracted will be considered.
47 * **matrix_name** (*str*) - ("BLOSUM62") Substitution matrices for use in alignments. Values: BENNER22, BENNER6, BENNER74, BLASTN, BLASTP, BLOSUM45, BLOSUM50, BLOSUM62, BLOSUM80, BLOSUM90, DAYHOFF, FENG, GENETIC, GONNET1992, HOXD70, JOHNSON, JONES, LEVIN, MCLACHLAN, MDM78, MEGABLAST, NUC.4.4, PAM250, PAM30, PAM70, RAO, RISLER, SCHNEIDER, STR, TRANS.
48 * **gap_open** (*float*) - (-10.0) [-1000~1000|0.1] Gap open penalty.
49 * **gap_extend** (*float*) - (-0.5) [-1000~1000|0.1] Gap extend penalty.
50 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
51 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
52 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.
54 Examples:
55 This is a use example of how to use the building block from Python::
57 from biobb_vs.utils.bindingsite import bindingsite
58 prop = {
59 'ligand': 'PGA',
60 'matrix_name': 'BLOSUM62',
61 'gap_open': -10.0,
62 'gap_extend': -0.5,
63 'max_num_ligands': 15,
64 'radius': 5
65 }
66 bindingsite(input_pdb_path='/path/to/myStructure.pdb',
67 input_clusters_zip='/path/to/myCluster.zip',
68 output_pdb_path='/path/to/newStructure.pdb',
69 properties=prop)
71 Info:
72 * wrapped_software:
73 * name: In house using Biopython
74 * version: >=1.76
75 * license: Apache-2.0
76 * ontology:
77 * name: EDAM
78 * schema: http://edamontology.org/EDAM.owl
80 """
82 def __init__(
83 self,
84 input_pdb_path,
85 input_clusters_zip,
86 output_pdb_path,
87 properties=None,
88 **kwargs,
89 ) -> None:
90 properties = properties or {}
92 # Call parent class constructor
93 super().__init__(properties)
94 self.locals_var_dict = locals().copy()
96 # Input/Output files
97 self.io_dict = {
98 "in": {
99 "input_pdb_path": input_pdb_path,
100 "input_clusters_zip": input_clusters_zip,
101 },
102 "out": {"output_pdb_path": output_pdb_path},
103 }
105 # Properties specific for BB
106 self.ligand = properties.get("ligand", None)
107 self.radius = float(properties.get("radius", 5.0))
108 self.max_num_ligands = properties.get("max_num_ligands", 15)
109 self.matrix_name = properties.get("matrix_name", "BLOSUM62")
110 self.gap_open = properties.get("gap_open", -10.0)
111 self.gap_extend = properties.get("gap_extend", -0.5)
112 self.properties = properties
114 # Check the properties
115 self.check_properties(properties)
116 self.check_arguments()
118 def check_data_params(self, out_log, err_log):
119 """Checks all the input/output paths and parameters"""
120 self.io_dict["in"]["input_pdb_path"] = check_input_path(
121 self.io_dict["in"]["input_pdb_path"],
122 "input_pdb_path",
123 self.out_log,
124 self.__class__.__name__,
125 )
126 self.io_dict["in"]["input_clusters_zip"] = check_input_path(
127 self.io_dict["in"]["input_clusters_zip"],
128 "input_clusters_zip",
129 self.out_log,
130 self.__class__.__name__,
131 )
132 self.io_dict["out"]["output_pdb_path"] = check_output_path(
133 self.io_dict["out"]["output_pdb_path"],
134 "output_pdb_path",
135 False,
136 self.out_log,
137 self.__class__.__name__,
138 )
140 @launchlogger
141 def launch(self) -> int:
142 """Execute the :class:`BindingSite <utils.bindingsite.BindingSite>` utils.bindingsite.BindingSite object."""
144 # check input/output paths and parameters
145 self.check_data_params(self.out_log, self.err_log)
147 # Setup Biobb
148 if self.check_restart():
149 return 0
150 self.stage_files()
152 # Parse structure
153 fu.log(
154 "Loading input PDB structure %s" % (self.io_dict["in"]["input_pdb_path"]),
155 self.out_log,
156 self.global_log,
157 )
158 structure_name = PurePath(self.io_dict["in"]["input_pdb_path"]).name
159 parser = Bio.PDB.PDBParser(QUIET=True)
160 structPDB = parser.get_structure(
161 structure_name, self.io_dict["in"]["input_pdb_path"]
162 )
164 if len(structPDB):
165 structPDB = structPDB[0]
167 # Use only one chain
168 n_chains = structPDB.get_list()
169 if len(n_chains) != 1:
170 fu.log(
171 "More than one chain found in the input PDB structure. Using only the first chain to find the binding site",
172 self.out_log,
173 self.global_log,
174 )
175 # get first chain in case there is more than one chain
176 for struct_chain in structPDB.get_chains():
177 structPDB = struct_chain
179 # Get AA sequence
180 structPDB_seq = get_pdb_sequence(structPDB)
181 if len(structPDB_seq) == 0:
182 fu.log(
183 self.__class__.__name__ + ": Cannot extract AA sequence from the input PDB structure %s. Wrong format?"
184 % self.io_dict["in"]["input_pdb_path"],
185 self.out_log,
186 )
187 raise SystemExit(
188 self.__class__.__name__ + ": Cannot extract AA sequence from the input PDB structure %s. Wrong format?"
189 % self.io_dict["in"]["input_pdb_path"]
190 )
191 else:
192 fu.log(
193 "Found %s residues in %s"
194 % (len(structPDB_seq), self.io_dict["in"]["input_pdb_path"]),
195 self.out_log,
196 )
198 # create temporary folder for decompressing the input_clusters_zip file
199 unique_dir = PurePath(fu.create_unique_dir())
200 fu.log(
201 "Creating %s temporary folder" % unique_dir, self.out_log, self.global_log
202 )
204 # decompress the input_clusters_zip file
205 cluster_list = fu.unzip_list(
206 zip_file=self.io_dict["in"]["input_clusters_zip"],
207 dest_dir=str(unique_dir),
208 out_log=self.out_log,
209 )
211 clusterPDB_ligands_aligned = []
212 clusterPDB_ligands_num = 0
214 fu.log("Iterating on all clusters:", self.out_log)
216 for idx, cluster_path in enumerate(cluster_list):
217 cluster_name = PurePath(cluster_path).stem
218 fu.log(" ", self.out_log)
219 fu.log(
220 "------------ Iteration #%s --------------" % (idx + 1), self.out_log
221 )
222 fu.log("Cluster member: %s" % cluster_name, self.out_log)
224 # Load and Parse PDB
225 clusterPDB = {}
226 clusterPDB = parser.get_structure(cluster_name, cluster_path)[0]
228 # Use only the first chain
229 for cluster_chain in clusterPDB.get_chains(): # type: ignore
230 clusterPDB = cluster_chain
232 # Looking for ligands
233 clusterPDB_ligands = get_ligand_residues(clusterPDB)
234 if (len(clusterPDB_ligands)) == 0:
235 fu.log(
236 "No ligands found that could guide the binding site search. Ignoring this member: %s"
237 % cluster_name,
238 self.out_log,
239 )
240 continue
242 # Selecting the largest ligand, if more than one
243 lig_atoms_num = 0
244 clusterPDB_ligand = {}
245 if self.ligand:
246 if self.ligand in [x.get_resname() for x in clusterPDB_ligands]:
247 for lig in clusterPDB_ligands:
248 if lig.get_resname() == self.ligand:
249 clusterPDB_ligand = lig
250 lig_atoms_num = len(lig.get_list())
251 fu.log(
252 "Ligand found: %s (%s atoms)"
253 % (lig.get_resname(), lig_atoms_num),
254 self.out_log,
255 )
256 else:
257 fu.log(
258 "Ligand %s not found in %s cluster member, skipping this cluster"
259 % (self.ligand, cluster_name),
260 self.out_log,
261 )
262 continue
263 else:
264 if len(clusterPDB_ligands) > 1:
265 for lig_res in clusterPDB_ligands:
266 lig_res_atoms_num = len(lig_res.get_list())
267 fu.log(
268 "Ligand found: %s (%s atoms)"
269 % (lig_res.get_resname(), lig_res_atoms_num),
270 self.out_log,
271 )
272 if lig_res_atoms_num > lig_atoms_num:
273 clusterPDB_ligand = lig_res
274 lig_atoms_num = lig_res_atoms_num
275 else:
276 clusterPDB_ligand = clusterPDB_ligands[0]
277 lig_atoms_num = len(clusterPDB_ligands[0].get_list())
279 fu.log(
280 "Member accepted. Valid ligand found: %s (%s atoms)"
281 % (clusterPDB_ligand.get_resname(), lig_atoms_num), # type: ignore
282 self.out_log,
283 )
285 # Mapping residues by sequence alignment to match structPDB-clusterPDB paired residues
287 # Get AA sequence
288 clusterPDB_seq = get_pdb_sequence(clusterPDB)
290 # Pairwise align
291 aln, residue_map = align_sequences(
292 structPDB_seq,
293 clusterPDB_seq,
294 self.matrix_name,
295 self.gap_open,
296 self.gap_extend,
297 )
298 fu.log(
299 "Matching residues to input PDB structure. Alignment is:\n%s"
300 % (aln[1]),
301 self.out_log,
302 )
304 # Calculate (gapless) sequence identity
305 seq_identity, gap_seq_identity = calculate_alignment_identity(
306 aln[0], aln[1]
307 )
308 fu.log("Sequence identity (%%): %s" % (seq_identity), self.out_log)
309 fu.log("Gap less identity (%%): %s" % (gap_seq_identity), self.out_log)
311 # Selecting aligned CA atoms from first model, first chain
313 struct_atoms = []
314 cluster_atoms = []
316 for struct_res in residue_map:
317 try:
318 cluster_atoms.append(clusterPDB[residue_map[struct_res]]["CA"])
319 struct_atoms.append(get_residue_by_id(structPDB, struct_res)["CA"])
320 except KeyError:
321 fu.log(
322 "Cannot find CA atom for residue %s (input PDB %s)"
323 % (
324 get_residue_by_id(structPDB, struct_res).get_resname(),
325 struct_res,
326 ),
327 self.out_log,
328 )
329 pass
331 if len(cluster_atoms) == 0:
332 fu.log(
333 self.__class__.__name__ + ": Cannot find CA atoms (1st model, 1st chain) in cluster member %s when aligning against %s. Ignoring this member."
334 % (cluster_name, structure_name),
335 self.out_log,
336 )
337 raise SystemExit(
338 self.__class__.__name__ + ": Cannot find CA atoms (1st model, 1st chain) in cluster member %s when aligning against %s. Ignoring this member."
339 % (cluster_name, structure_name)
340 )
341 else:
342 fu.log(
343 "Superimposing %s aligned protein residues" % (len(cluster_atoms)),
344 self.out_log,
345 )
347 # Align against input structure
349 si = Bio.PDB.Superimposer()
350 si.set_atoms(struct_atoms, cluster_atoms)
351 si.apply(clusterPDB.get_atoms()) # type: ignore
352 fu.log("RMSD: %s" % (si.rms), self.out_log)
354 # Save transformed structure (and ligand)
355 clusterPDB_ligand_aligned = clusterPDB[clusterPDB_ligand.get_id()] # type: ignore
356 fu.log("Saving transformed ligand coordinates", self.out_log)
358 clusterPDB_ligands_aligned.append(clusterPDB_ligand_aligned)
360 # Stop after n accepted cluster members
362 clusterPDB_ligands_num += 1
364 if clusterPDB_ligands_num > self.max_num_ligands:
365 break
367 fu.log(" ", self.out_log)
368 fu.log("----------------------------------------", self.out_log)
369 fu.log(
370 "All transformed ligand coordinates saved, getting binding site residues",
371 self.out_log,
372 )
374 # Select binding site atoms as those around cluster superimposed ligands
376 fu.log(
377 "Defining binding site residues as those %sÅ around the %s cluster superimposed ligands"
378 % (self.radius, clusterPDB_ligands_num),
379 self.out_log,
380 )
382 # select Atoms from aligned ligands
383 clusterPDB_ligands_aligned2 = [res for res in clusterPDB_ligands_aligned]
384 clusterPDB_ligands_aligned_atoms = Bio.PDB.Selection.unfold_entities(
385 clusterPDB_ligands_aligned2, "A"
386 )
388 # select Atoms from input PDB structure
389 structPDB_atoms = [atom for atom in structPDB.get_atoms()]
391 # compute neighbors for aligned ligands in the input PDB structure
392 structPDB_bs_residues_raw = {}
393 structPDB_neighbors = Bio.PDB.NeighborSearch(structPDB_atoms)
394 for ligand_atom in clusterPDB_ligands_aligned_atoms:
395 # look for PDB atoms 5A around each ligand atom
396 k_l = structPDB_neighbors.search(ligand_atom.coord, self.radius, "R")
397 for k in k_l:
398 structPDB_bs_residues_raw[k.get_id()] = k.get_full_id()
400 # Save binding site to PDB
402 io = Bio.PDB.PDBIO()
403 fu.log(
404 "Writing binding site residues into %s"
405 % (self.io_dict["out"]["output_pdb_path"]),
406 self.out_log,
407 )
409 # unselect input PDB atoms not in binding site
410 structPDB_bs_atoms = 0
411 p = re.compile("H_|W_|W")
412 residue_ids_to_remove = []
413 for res in structPDB.get_residues():
414 if res.id not in structPDB_bs_residues_raw.keys():
415 # add residue to residue_ids_to_remove list
416 residue_ids_to_remove.append(res.id)
417 elif p.match(res.resname):
418 # add residue to residue_ids_to_remove list
419 residue_ids_to_remove.append(res.id)
420 else:
421 # this residue will be preserved
422 structPDB_bs_atoms += len(res.get_list())
424 # unselect input PDB atoms not in binding site
425 for chain in structPDB:
426 for idr in residue_ids_to_remove:
427 chain.detach_child(idr)
429 # write PDB file
430 io.set_structure(structPDB)
431 io.save(self.io_dict["out"]["output_pdb_path"])
433 # Copy files to host
434 self.copy_to_host()
436 self.tmp_files.append(str(unique_dir))
437 self.remove_tmp_files()
439 self.check_arguments(output_files_created=True, raise_exception=False)
441 return 0
444def bindingsite(
445 input_pdb_path: str,
446 input_clusters_zip: str,
447 output_pdb_path: str,
448 properties: Optional[dict] = None,
449 **kwargs,
450) -> int:
451 """Create the :class:`BindingSite <utils.bindingsite.BindingSite>` class and
452 execute the :meth:`launch() <utils.bindingsite.BindingSite.launch>` method."""
453 return BindingSite(**dict(locals())).launch()
456bindingsite.__doc__ = BindingSite.__doc__
457main = BindingSite.get_main(bindingsite, "Finds the binding site of the input_pdb file based on the ligands' location of similar structures (members of the sequence identity cluster)")
460if __name__ == "__main__":
461 main()