Coverage for biobb_vs/utils/bindingsite.py: 80%

1#!/usr/bin/env python3

3"""Module containing the BindingSite class and the command line interface."""

5import argparse

6import re

7import warnings

8from pathlib import PurePath

9from typing import Optional

11from Bio import BiopythonDeprecationWarning

12from biobb_common.configuration import settings

13from biobb_common.generic.biobb_object import BiobbObject

14from biobb_common.tools import file_utils as fu

15from biobb_common.tools.file_utils import launchlogger

17from biobb_vs.utils.common import (

18 align_sequences,

19 calculate_alignment_identity,

20 check_input_path,

21 check_output_path,

22 get_ligand_residues,

23 get_pdb_sequence,

24 get_residue_by_id,

25)

27with warnings.catch_warnings():

28 warnings.simplefilter("ignore", BiopythonDeprecationWarning)

29 # try:

30 # import Bio.SubsMat.MatrixInfo

31 # except ImportError:

32 import Bio.Align.substitution_matrices

33 import Bio.pairwise2

34 import Bio.PDB

37class BindingSite(BiobbObject):

38 """

39 | biobb_vs BindingSite

40 | This class finds the binding site of the input_pdb.

41 | Finds the binding site of the input_pdb_path file based on the ligands' location of similar structures (members of the sequence identity cluster)

43 Args:

44 input_pdb_path (str): Path to the PDB structure where the binding site is to be found. File type: input. `Sample file <https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/data/utils/bindingsite.pdb>`_. Accepted formats: pdb (edam:format_1476).

45 input_clusters_zip (str): Path to the ZIP file with all the PDB members of the identity cluster. File type: input. `Sample file <https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/data/utils/bindingsite.zip>`_. Accepted formats: zip (edam:format_3987).

46 output_pdb_path (str): Path to the PDB containig the residues belonging to the binding site. File type: output. `Sample file <https://github.com/bioexcel/biobb_vs/raw/master/biobb_vs/test/reference/utils/ref_output_bindingsite.pdb>`_. Accepted formats: pdb (edam:format_1476).

47 properties (dic - Python dictionary object containing the tool parameters, not input/output files):

48 * **ligand** (*str*) - (None) Ligand to be found in the protein structure. If no ligand provided, the largest one will be selected, if more than one.

49 * **radius** (*float*) - (5.0) [0.1~1000|0.1] Cut-off distance (Ångstroms) around ligand atoms to consider a protein atom as a binding site atom.

50 * **max_num_ligands** (*int*) - (15) [0~1000|1] Total number of superimposed ligands to be extracted from the identity cluster. For populated clusters, the restriction avoids to superimpose redundant structures. If 0, all ligands extracted will be considered.

51 * **matrix_name** (*str*) - ("BLOSUM62") Substitution matrices for use in alignments. Values: BENNER22, BENNER6, BENNER74, BLASTN, BLASTP, BLOSUM45, BLOSUM50, BLOSUM62, BLOSUM80, BLOSUM90, DAYHOFF, FENG, GENETIC, GONNET1992, HOXD70, JOHNSON, JONES, LEVIN, MCLACHLAN, MDM78, MEGABLAST, NUC.4.4, PAM250, PAM30, PAM70, RAO, RISLER, SCHNEIDER, STR, TRANS.

52 * **gap_open** (*float*) - (-10.0) [-1000~1000|0.1] Gap open penalty.

53 * **gap_extend** (*float*) - (-0.5) [-1000~1000|0.1] Gap extend penalty.

54 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.

55 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.

56 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.

58 Examples:

59 This is a use example of how to use the building block from Python::

61 from biobb_vs.utils.bindingsite import bindingsite

62 prop = {

63 'ligand': 'PGA',

64 'matrix_name': 'BLOSUM62',

65 'gap_open': -10.0,

66 'gap_extend': -0.5,

67 'max_num_ligands': 15,

68 'radius': 5

69 }

70 bindingsite(input_pdb_path='/path/to/myStructure.pdb',

71 input_clusters_zip='/path/to/myCluster.zip',

72 output_pdb_path='/path/to/newStructure.pdb',

73 properties=prop)

75 Info:

76 * wrapped_software:

77 * name: In house using Biopython

78 * version: >=1.76

79 * license: Apache-2.0

80 * ontology:

81 * name: EDAM

82 * schema: http://edamontology.org/EDAM.owl

84 """

86 def __init__(

87 self,

88 input_pdb_path,

89 input_clusters_zip,

90 output_pdb_path,

91 properties=None,

92 **kwargs,

93 ) -> None:

94 properties = properties or {}

96 # Call parent class constructor

97 super().__init__(properties)

98 self.locals_var_dict = locals().copy()

100 # Input/Output files

101 self.io_dict = {

102 "in": {

103 "input_pdb_path": input_pdb_path,

104 "input_clusters_zip": input_clusters_zip,

105 },

106 "out": {"output_pdb_path": output_pdb_path},

107 }

108

109 # Properties specific for BB

110 self.ligand = properties.get("ligand", None)

111 self.radius = float(properties.get("radius", 5.0))

112 self.max_num_ligands = properties.get("max_num_ligands", 15)

113 self.matrix_name = properties.get("matrix_name", "BLOSUM62")

114 self.gap_open = properties.get("gap_open", -10.0)

115 self.gap_extend = properties.get("gap_extend", -0.5)

116 self.properties = properties

117

118 # Check the properties

119 self.check_properties(properties)

120 self.check_arguments()

121

122 def check_data_params(self, out_log, err_log):

123 """Checks all the input/output paths and parameters"""

124 self.io_dict["in"]["input_pdb_path"] = check_input_path(

125 self.io_dict["in"]["input_pdb_path"],

126 "input_pdb_path",

127 self.out_log,

128 self.__class__.__name__,

129 )

130 self.io_dict["in"]["input_clusters_zip"] = check_input_path(

131 self.io_dict["in"]["input_clusters_zip"],

132 "input_clusters_zip",

133 self.out_log,

134 self.__class__.__name__,

135 )

136 self.io_dict["out"]["output_pdb_path"] = check_output_path(

137 self.io_dict["out"]["output_pdb_path"],

138 "output_pdb_path",

139 False,

140 self.out_log,

141 self.__class__.__name__,

142 )

143

144 @launchlogger

145 def launch(self) -> int:

146 """Execute the :class:`BindingSite <utils.bindingsite.BindingSite>` utils.bindingsite.BindingSite object."""

147

148 # check input/output paths and parameters

149 self.check_data_params(self.out_log, self.err_log)

150

151 # Setup Biobb

152 if self.check_restart():

153 return 0

154 self.stage_files()

155

156 # Parse structure

157 fu.log(

158 "Loading input PDB structure %s" % (self.io_dict["in"]["input_pdb_path"]),

159 self.out_log,

160 self.global_log,

161 )

162 structure_name = PurePath(self.io_dict["in"]["input_pdb_path"]).name

163 parser = Bio.PDB.PDBParser(QUIET=True)

164 structPDB = parser.get_structure(

165 structure_name, self.io_dict["in"]["input_pdb_path"]

166 )

167

168 if len(structPDB):

169 structPDB = structPDB[0]

170

171 # Use only one chain

172 n_chains = structPDB.get_list()

173 if len(n_chains) != 1:

174 fu.log(

175 "More than one chain found in the input PDB structure. Using only the first chain to find the binding site",

176 self.out_log,

177 self.global_log,

178 )

179 # get first chain in case there is more than one chain

180 for struct_chain in structPDB.get_chains():

181 structPDB = struct_chain

182

183 # Get AA sequence

184 structPDB_seq = get_pdb_sequence(structPDB)

185 if len(structPDB_seq) == 0:

186 fu.log(

187 self.__class__.__name__ + ": Cannot extract AA sequence from the input PDB structure %s. Wrong format?"

188 % self.io_dict["in"]["input_pdb_path"],

189 self.out_log,

190 )

191 raise SystemExit(

192 self.__class__.__name__ + ": Cannot extract AA sequence from the input PDB structure %s. Wrong format?"

193 % self.io_dict["in"]["input_pdb_path"]

194 )

195 else:

196 fu.log(

197 "Found %s residues in %s"

198 % (len(structPDB_seq), self.io_dict["in"]["input_pdb_path"]),

199 self.out_log,

200 )

201

202 # create temporary folder for decompressing the input_clusters_zip file

203 unique_dir = PurePath(fu.create_unique_dir())

204 fu.log(

205 "Creating %s temporary folder" % unique_dir, self.out_log, self.global_log

206 )

207

208 # decompress the input_clusters_zip file

209 cluster_list = fu.unzip_list(

210 zip_file=self.io_dict["in"]["input_clusters_zip"],

211 dest_dir=str(unique_dir),

212 out_log=self.out_log,

213 )

214

215 clusterPDB_ligands_aligned = []

216 clusterPDB_ligands_num = 0

217

218 fu.log("Iterating on all clusters:", self.out_log)

219

220 for idx, cluster_path in enumerate(cluster_list):

221 cluster_name = PurePath(cluster_path).stem

222 fu.log(" ", self.out_log)

223 fu.log(

224 "------------ Iteration #%s --------------" % (idx + 1), self.out_log

225 )

226 fu.log("Cluster member: %s" % cluster_name, self.out_log)

227

228 # Load and Parse PDB

229 clusterPDB = {}

230 clusterPDB = parser.get_structure(cluster_name, cluster_path)[0]

231

232 # Use only the first chain

233 for cluster_chain in clusterPDB.get_chains(): # type: ignore

234 clusterPDB = cluster_chain

235

236 # Looking for ligands

237 clusterPDB_ligands = get_ligand_residues(clusterPDB)

238 if (len(clusterPDB_ligands)) == 0:

239 fu.log(

240 "No ligands found that could guide the binding site search. Ignoring this member: %s"

241 % cluster_name,

242 self.out_log,

243 )

244 continue

245

246 # Selecting the largest ligand, if more than one

247 lig_atoms_num = 0

248 clusterPDB_ligand = {}

249 if self.ligand:

250 if self.ligand in [x.get_resname() for x in clusterPDB_ligands]:

251 for lig in clusterPDB_ligands:

252 if lig.get_resname() == self.ligand:

253 clusterPDB_ligand = lig

254 lig_atoms_num = len(lig.get_list())

255 fu.log(

256 "Ligand found: %s (%s atoms)"

257 % (lig.get_resname(), lig_atoms_num),

258 self.out_log,

259 )

260 else:

261 fu.log(

262 "Ligand %s not found in %s cluster member, skipping this cluster"

263 % (self.ligand, cluster_name),

264 self.out_log,

265 )

266 continue

267 else:

268 if len(clusterPDB_ligands) > 1:

269 for lig_res in clusterPDB_ligands:

270 lig_res_atoms_num = len(lig_res.get_list())

271 fu.log(

272 "Ligand found: %s (%s atoms)"

273 % (lig_res.get_resname(), lig_res_atoms_num),

274 self.out_log,

275 )

276 if lig_res_atoms_num > lig_atoms_num:

277 clusterPDB_ligand = lig_res

278 lig_atoms_num = lig_res_atoms_num

279 else:

280 clusterPDB_ligand = clusterPDB_ligands[0]

281 lig_atoms_num = len(clusterPDB_ligands[0].get_list())

282

283 fu.log(

284 "Member accepted. Valid ligand found: %s (%s atoms)"

285 % (clusterPDB_ligand.get_resname(), lig_atoms_num), # type: ignore

286 self.out_log,

287 )

288

289 # Mapping residues by sequence alignment to match structPDB-clusterPDB paired residues

290

291 # Get AA sequence

292 clusterPDB_seq = get_pdb_sequence(clusterPDB)

293

294 # Pairwise align

295 aln, residue_map = align_sequences(

296 structPDB_seq,

297 clusterPDB_seq,

298 self.matrix_name,

299 self.gap_open,

300 self.gap_extend,

301 )

302 fu.log(

303 "Matching residues to input PDB structure. Alignment is:\n%s"

304 % (aln[1]),

305 self.out_log,

306 )

307

308 # Calculate (gapless) sequence identity

309 seq_identity, gap_seq_identity = calculate_alignment_identity(

310 aln[0], aln[1]

311 )

312 fu.log("Sequence identity (%%): %s" % (seq_identity), self.out_log)

313 fu.log("Gap less identity (%%): %s" % (gap_seq_identity), self.out_log)

314

315 # Selecting aligned CA atoms from first model, first chain

316

317 struct_atoms = []

318 cluster_atoms = []

319

320 for struct_res in residue_map:

321 try:

322 cluster_atoms.append(clusterPDB[residue_map[struct_res]]["CA"])

323 struct_atoms.append(get_residue_by_id(structPDB, struct_res)["CA"])

324 except KeyError:

325 fu.log(

326 "Cannot find CA atom for residue %s (input PDB %s)"

327 % (

328 get_residue_by_id(structPDB, struct_res).get_resname(),

329 struct_res,

330 ),

331 self.out_log,

332 )

333 pass

334

335 if len(cluster_atoms) == 0:

336 fu.log(

337 self.__class__.__name__ + ": Cannot find CA atoms (1st model, 1st chain) in cluster member %s when aligning against %s. Ignoring this member."

338 % (cluster_name, structure_name),

339 self.out_log,

340 )

341 raise SystemExit(

342 self.__class__.__name__ + ": Cannot find CA atoms (1st model, 1st chain) in cluster member %s when aligning against %s. Ignoring this member."

343 % (cluster_name, structure_name)

344 )

345 else:

346 fu.log(

347 "Superimposing %s aligned protein residues" % (len(cluster_atoms)),

348 self.out_log,

349 )

350

351 # Align against input structure

352

353 si = Bio.PDB.Superimposer()

354 si.set_atoms(struct_atoms, cluster_atoms)

355 si.apply(clusterPDB.get_atoms()) # type: ignore

356 fu.log("RMSD: %s" % (si.rms), self.out_log)

357

358 # Save transformed structure (and ligand)

359 clusterPDB_ligand_aligned = clusterPDB[clusterPDB_ligand.get_id()] # type: ignore

360 fu.log("Saving transformed ligand coordinates", self.out_log)

361

362 clusterPDB_ligands_aligned.append(clusterPDB_ligand_aligned)

363

364 # Stop after n accepted cluster members

365

366 clusterPDB_ligands_num += 1

367

368 if clusterPDB_ligands_num > self.max_num_ligands:

369 break

370

371 fu.log(" ", self.out_log)

372 fu.log("----------------------------------------", self.out_log)

373 fu.log(

374 "All transformed ligand coordinates saved, getting binding site residues",

375 self.out_log,

376 )

377

378 # Select binding site atoms as those around cluster superimposed ligands

379

380 fu.log(

381 "Defining binding site residues as those %sÅ around the %s cluster superimposed ligands"

382 % (self.radius, clusterPDB_ligands_num),

383 self.out_log,

384 )

385

386 # select Atoms from aligned ligands

387 clusterPDB_ligands_aligned2 = [res for res in clusterPDB_ligands_aligned]

388 clusterPDB_ligands_aligned_atoms = Bio.PDB.Selection.unfold_entities(

389 clusterPDB_ligands_aligned2, "A"

390 )

391

392 # select Atoms from input PDB structure

393 structPDB_atoms = [atom for atom in structPDB.get_atoms()]

394

395 # compute neighbors for aligned ligands in the input PDB structure

396 structPDB_bs_residues_raw = {}

397 structPDB_neighbors = Bio.PDB.NeighborSearch(structPDB_atoms)

398 for ligand_atom in clusterPDB_ligands_aligned_atoms:

399 # look for PDB atoms 5A around each ligand atom

400 k_l = structPDB_neighbors.search(ligand_atom.coord, self.radius, "R")

401 for k in k_l:

402 structPDB_bs_residues_raw[k.get_id()] = k.get_full_id()

403

404 # Save binding site to PDB

405

406 io = Bio.PDB.PDBIO()

407 fu.log(

408 "Writing binding site residues into %s"

409 % (self.io_dict["out"]["output_pdb_path"]),

410 self.out_log,

411 )

412

413 # unselect input PDB atoms not in binding site

414 structPDB_bs_atoms = 0

415 p = re.compile("H_|W_|W")

416 residue_ids_to_remove = []

417 for res in structPDB.get_residues():

418 if res.id not in structPDB_bs_residues_raw.keys():

419 # add residue to residue_ids_to_remove list

420 residue_ids_to_remove.append(res.id)

421 elif p.match(res.resname):

422 # add residue to residue_ids_to_remove list

423 residue_ids_to_remove.append(res.id)

424 else:

425 # this residue will be preserved

426 structPDB_bs_atoms += len(res.get_list())

427

428 # unselect input PDB atoms not in binding site

429 for chain in structPDB:

430 for idr in residue_ids_to_remove:

431 chain.detach_child(idr)

432

433 # write PDB file

434 io.set_structure(structPDB)

435 io.save(self.io_dict["out"]["output_pdb_path"])

436

437 # Copy files to host

438 self.copy_to_host()

439

440 self.tmp_files.extend([

441 # self.stage_io_dict.get("unique_dir", ""),

442 str(unique_dir)

443 ])

444 self.remove_tmp_files()

445

446 self.check_arguments(output_files_created=True, raise_exception=False)

447

448 return 0

449

450

451def bindingsite(

452 input_pdb_path: str,

453 input_clusters_zip: str,

454 output_pdb_path: str,

455 properties: Optional[dict] = None,

456 **kwargs,

457) -> int:

458 """Execute the :class:`BindingSite <utils.bindingsite.BindingSite>` class and

459 execute the :meth:`launch() <utils.bindingsite.BindingSite.launch>` method."""

460

461 return BindingSite(

462 input_pdb_path=input_pdb_path,

463 input_clusters_zip=input_clusters_zip,

464 output_pdb_path=output_pdb_path,

465 properties=properties,

466 **kwargs,

467 ).launch()

468

469 bindingsite.__doc__ = BindingSite.__doc__

470

471

472def main():

473 """Command line execution of this building block. Please check the command line documentation."""

474 parser = argparse.ArgumentParser(

475 description="Finds the binding site of the input_pdb file based on the ligands' location of similar structures (members of the sequence identity cluster)",

476 formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999),

477 )

478 parser.add_argument("--config", required=False, help="Configuration file")

479

480 # Specific args of each building block

481 required_args = parser.add_argument_group("required arguments")

482 required_args.add_argument(

483 "--input_pdb_path",

484 required=True,

485 help="Path to the PDB structure where the binding site is to be found. Accepted formats: pdb.",

486 )

487 required_args.add_argument(

488 "--input_clusters_zip",

489 required=True,

490 help="Path to the ZIP file with all the PDB members of the identity cluster. Accepted formats: zip.",

491 )

492 required_args.add_argument(

493 "--output_pdb_path",

494 required=True,

495 help="Path to the PDB containig the residues belonging to the binding site. Accepted formats: pdb.",

496 )

497

498 args = parser.parse_args()

499 args.config = args.config or "{}"

500 properties = settings.ConfReader(config=args.config).get_prop_dic()

501

502 # Specific call of each building block

503 bindingsite(

504 input_pdb_path=args.input_pdb_path,

505 input_clusters_zip=args.input_clusters_zip,

506 output_pdb_path=args.output_pdb_path,

507 properties=properties,

508 )

509

510

511if __name__ == "__main__":

512 main()