Coverage for biobb_model/model/fix_pdb

1import json

2import math

3import os

4import urllib.request

5from bisect import bisect

6from typing import Optional, Union

8import xmltodict # type: ignore

9from Bio import pairwise2 # type: ignore

10from Bio.Blast import NCBIWWW # type: ignore

11from Bio.pairwise2 import format_alignment # type: ignore

13Coords = tuple[float, float, float]

15try:

16 from Bio.SubsMat import MatrixInfo # type: ignore

18 blosum62 = MatrixInfo.blosum62

19except ImportError:

20 from Bio.Align import substitution_matrices # type: ignore

22 blosum62 = substitution_matrices.load("BLOSUM62")

25# An atom

26class Atom:

27 def __init__(

28 self,

29 name: Optional[str] = None,

30 element: Optional[str] = None,

31 coords: Optional[Coords] = None,

32 ):

33 self.name = name

34 self.element = element

35 self.coords = coords

36 # Set variables to store references to other related instances

37 # These variables will be set further by the structure

38 self._structure: Optional[Structure] = None

39 self._index: Optional[int] = None

40 self._residue_index: Optional[int] = None

42 def __repr__(self):

43 return "<Atom " + str(self.name) + ">"

45 def __eq__(self, other):

46 return self._residue_index == other._residue_index and self.name == other.name

48 # The parent structure (read only)

49 # This value is set by the structure itself

50 def get_structure(self):

51 return self._structure

53 structure = property(get_structure, None, None, "The parent structure (read only)")

55 # The residue index according to parent structure residues (read only)

56 # This value is set by the structure itself

57 def get_index(self):

58 return self._index

60 index = property(

61 get_index,

62 None,

63 None,

64 "The residue index according to parent structure residues (read only)",

65 )

67 # The atom residue index according to parent structure residues

68 # If residue index is set then make changes in all the structure to make this change coherent

69 def get_residue_index(self) -> Optional[int]:

70 return self._residue_index

72 def set_residue_index(self, new_residue_index: int):

73 # If there is not strucutre yet it means the residue is beeing set before the structure

74 # We just save the residue index and wait for the structure to be set

75 if not self.structure:

76 self._residue_index = new_residue_index

77 return

78 # Relational indices are updated through a top-down hierarchy

79 # Affected residues are the ones to update this atom internal residue index

80 current_residue = self.residue

81 current_residue.remove_atom(self)

82 new_residue = self.structure.residues[new_residue_index]

83 new_residue.add_atom(self)

85 residue_index = property(

86 get_residue_index,

87 set_residue_index,

88 None,

89 "The atom residue index according to parent structure residues",

90 )

92 # The atom residue

93 # If residue is set then make changes in all the structure to make this change coherent

94 def get_residue(self) -> Optional["Residue"]:

95 # If there is not strucutre yet it means the atom is beeing set before the structure

96 # In this case it is not possible to get related residues in the structure

97 if not self.structure:

98 return None

99 # Get the residue in the structure according to the residue index

100 return self.structure.residues[self.residue_index]

101

102 def set_residue(self, new_residue: "Residue"):

103 # Find the new residue index and set it as the atom residue index

104 # Note that the residue must be set in the structure already

105 new_residue_index = new_residue.index

106 if new_residue_index is None:

107 raise ValueError(

108 "Residue " + str(new_residue) + " is not set in the structure"

109 )

110 self.set_residue_index(new_residue_index)

111

112 residue = property(get_residue, set_residue, None, "The atom residue")

113

114 # The atom chain index according to parent structure chains (read only)

115 # In order to change the chain index it must be changed in the atom residue

116 def get_chain_index(self) -> int:

117 return self.residue.chain_index

118

119 chain_index = property(

120 get_chain_index,

121 None,

122 None,

123 "The atom chain index according to parent structure chains (read only)",

124 )

125

126 # The atom chain(read only)

127 # In order to change the chain it must be changed in the atom residue

128 def get_chain(self) -> "Chain":

129 # Get the chain in the structure according to the chain index

130 return self.structure.chains[self.chain_index]

131

132 chain = property(get_chain, None, None, "The atom chain (read only)")

133

134

135# A residue

136class Residue:

137 def __init__(

138 self,

139 name: Optional[str] = None,

140 number: Optional[int] = None,

141 icode: Optional[str] = None,

142 ):

143 self.name = name

144 self.number = number

145 self.icode = icode

146 # Set variables to store references to other related instaces

147 # These variables will be set further by the structure

148 self._structure: Optional[Structure] = None

149 self._index: Optional[int] = None

150 self._atom_indices: list[int] = []

151 self._chain_index: Optional[int] = None

152

153 def __repr__(self):

154 return (

155 "<Residue " + str(self.name) + str(self.number) + (self.icode if self.icode else "") + ">"

156 )

157

158 def __eq__(self, other):

159 if not isinstance(self, type(other)):

160 return False

161 return (

162 self._chain_index == other._chain_index and self.number == other.number and self.icode == other.icode

163 )

164

165 def __hash__(self):

166 # WARNING: This is susceptible to duplicated residues

167 return hash((self._chain_index, self.number, self.icode))

168 # WARNING: This is not susceptible to duplicated residues

169 # return hash(tuple(self._atom_indices))

170

171 def same_inputs_as(self, other) -> bool:

172 return (

173 self.name == other.name and self.number == other.number and self.icode == other.icode

174 )

175

176 # The parent structure (read only)

177 # This value is set by the structure itself

178 def get_structure(self):

179 return self._structure

180

181 structure = property(get_structure, None, None, "The parent structure (read only)")

182

183 # The residue index according to parent structure residues (read only)

184 # This value is set by the structure itself

185 def get_index(self):

186 return self._index

187

188 index = property(

189 get_index,

190 None,

191 None,

192 "The residue index according to parent structure residues(read only)",

193 )

194

195 # The atom indices according to parent structure atoms for atoms in this residue

196 # If atom indices are set then make changes in all the structure to make this change coherent

197 def get_atom_indices(self) -> list[int]:

198 return self._atom_indices

199

200 def set_atom_indices(self, new_atom_indices: list[int]):

201 # If there is not strucutre yet it means the residue is beeing set before the structure

202 # We just save atom indices and wait for the structure to be set

203 if not self.structure:

204 self._atom_indices = new_atom_indices

205 return

206 # Update the current atoms

207 for atom in self.atoms:

208 atom._residue_index = None

209 # Update the new atoms

210 for index in new_atom_indices:

211 atom = self.structure.atoms[index]

212 atom._residue_index = self.index

213 # Now new indices are coherent and thus we can save them

214 self._atom_indices = new_atom_indices

215

216 atom_indices = property(

217 get_atom_indices,

218 set_atom_indices,

219 None,

220 "The atom indices according to parent structure atoms for atoms in this residue",

221 )

222

223 # The atoms in this residue

224 # If atoms are set then make changes in all the structure to make this change coherent

225 def get_atoms(self) -> list["Atom"]:

226 # If there is not strucutre yet it means the chain is beeing set before the structure

227 # In this case it is not possible to get related atoms in the structure

228 if not self.structure:

229 return []

230 # Get atoms in the structure according to atom indices

231 atoms = self.structure.atoms

232 return [atoms[atom_index] for atom_index in self.atom_indices]

233

234 def set_atoms(self, new_atoms: list["Atom"]):

235 # Find indices for new atoms and set their indices as the new atom indices

236 # Note that atoms must be set in the structure already

237 new_atom_indices = []

238 for new_atom in new_atoms:

239 new_atom_index = new_atom.index

240 if new_atom_index is None:

241 raise ValueError(

242 "Atom " + str(new_atom) + " is not set in the structure"

243 )

244 new_atom_indices.append(new_atom_index)

245 self.set_atom_indices(new_atom_indices)

246

247 atoms = property(get_atoms, set_atoms, None, "The atoms in this residue")

248

249 # Add an atom to the residue

250 def add_atom(self, new_atom: "Atom"):

251 # Insert the new atom index in the list of atom indices keeping the order

252 new_atom_index = new_atom.index

253 sorted_atom_index = bisect(self.atom_indices, new_atom_index)

254 self.atom_indices.insert(sorted_atom_index, new_atom_index)

255 # Update the atom internal index

256 new_atom._residue_index = self.index

257

258 # Remove an atom from the residue

259 def remove_atom(self, current_atom: "Atom"):

260 # Remove the current atom index from the atom indices list

261 self.atom_indices.remove(current_atom.index) # This index MUST be in the list

262 # Update the atom internal index

263 current_atom._residue_index = None

264

265 # The residue chain index according to parent structure chains

266 # If chain index is set then make changes in all the structure to make this change coherent

267 def get_chain_index(self) -> Optional[int]:

268 return self._chain_index

269

270 def set_chain_index(self, new_chain_index: int):

271 # If there is not strucutre yet it means the chain is beeing set before the structure

272 # We just save the chain index and wait for the structure to be set

273 if not self.structure:

274 self._chain_index = new_chain_index

275 return

276 # Relational indices are updated through a top-down hierarchy

277 # Affected chains are the ones to update this residue internal chain index

278 # WARNING: It is critical to find both current and new chains before removing/adding residues

279 # WARNING: It may happend that we remove the last residue in the current chain and the current chain is purged

280 # WARNING: Thus the 'new_chain_index' would be obsolete since the structure.chains list would have changed

281 current_chain = self.chain

282 new_chain = self.structure.chains[new_chain_index]

283 current_chain.remove_residue(self)

284 new_chain.add_residue(self)

285

286 chain_index = property(

287 get_chain_index,

288 set_chain_index,

289 None,

290 "The residue chain index according to parent structure chains",

291 )

292

293 # The residue chain

294 # If chain is set then make changes in all the structure to make this change coherent

295 def get_chain(self) -> Union["Chain", list]:

296 # If there is not strucutre yet it means the residue is beeing set before the structure

297 # In this case it is not possible to get related chain in the structure

298 if not self.structure:

299 return []

300 # Get the chain in the structure according to the chain index

301 return self.structure.chains[self.chain_index]

302

303 def set_chain(self, new_chain: Union["Chain", str]):

304 # In case the chain is just a string we must find/create the corresponding chain

305 if isinstance(new_chain, str):

306 letter = new_chain

307 # Get the residue structure

308 structure = self.structure

309 if not structure:

310 raise ValueError(

311 "Cannot find the corresponding " + new_chain + " chain without the structure"

312 )

313 # Find if the letter belongs to an already existing chain

314 new_chain = structure.get_chain_by_name(letter)

315 # If the chain does not exist yet then create it

316 if not new_chain:

317 new_chain = Chain(name=letter)

318 structure.set_new_chain(new_chain)

319 # Find the new chain index and set it as the residue chain index

320 # Note that the chain must be set in the structure already

321 new_chain_index = new_chain.index

322 if new_chain_index is None:

323 raise ValueError("Chain " + str(new_chain) + " is not set in the structure")

324 if isinstance(new_chain_index, int):

325 self.set_chain_index(new_chain_index)

326

327 chain = property(get_chain, set_chain, None, "The residue chain")

328

329

330# A chain

331class Chain:

332 def __init__(self, name: Optional[str] = None):

333 self.name = name

334 # Set variables to store references to other related instaces

335 # These variables will be set further by the structure

336 self._structure: Optional[Structure] = None

337 self._index: Optional[int] = None

338 self.residue_indices = []

339

340 def __repr__(self):

341 return "<Chain " + str(self.name) + ">"

342

343 def __eq__(self, other):

344 return self.name == other.name

345

346 # The parent structure(read only)

347 # This value is set by the structure itself

348 def get_structure(self):

349 return self._structure

350

351 structure = property(get_structure, None, None, "The parent structure(read only)")

352

353 # The residue index according to parent structure residues(read only)

354 # This value is set by the structure itself

355 def get_index(self):

356 return self._index

357

358 # When the index is set all residues are updated with the nex chain index

359 def set_index(self, index: int):

360 for residue in self.residues:

361 residue._chain_index = index

362 self._index = index

363

364 index = property(

365 get_index,

366 set_index,

367 None,

368 "The residue index according to parent structure residues(read only)",

369 )

370

371 # The residue indices according to parent structure residues for residues in this chain

372 # If residue indices are set then make changes in all the structure to make this change coherent

373 def get_residue_indices(self) -> list[int]:

374 return self._residue_indices

375

376 def set_residue_indices(self, new_residue_indices: list[int]):

377 # If there is not strucutre yet it means the chain is beeing set before the structure

378 # We just save residue indices and wait for the structure to be set

379 if not self.structure:

380 self._residue_indices = new_residue_indices

381 return

382 # Update the current residues

383 for residue in self.residues:

384 residue._chain_index = None

385 # Update the new residues

386 for index in new_residue_indices:

387 residue = self.structure.residues[index]

388 residue._chain_index = self.index

389 # In case the new residue indices list is empty this chain must be removed from its structure

390 if len(new_residue_indices) == 0:

391 self.structure.purge_chain(self)

392 # Now new indices are coherent and thus we can save them

393 self._residue_indices = new_residue_indices

394

395 residue_indices = property(

396 get_residue_indices,

397 set_residue_indices,

398 None,

399 "The residue indices according to parent structure residues for residues in this residue",

400 )

401

402 # The residues in this chain

403 # If residues are set then make changes in all the structure to make this change coherent

404 def get_residues(self) -> list["Residue"]:

405 # If there is not strucutre yet it means the chain is beeing set before the structure

406 # In this case it is not possible to get related residues in the structure

407 if not self.structure:

408 return []

409 # Get residues in the structure according to residue indices

410 residues = self.structure.residues

411 return [residues[residue_index] for residue_index in self.residue_indices]

412

413 def set_residues(self, new_residues: list["Residue"]):

414 # Find indices for new residues and set their indices as the new residue indices

415 # Note that residues must be set in the structure already

416 new_residue_indices = []

417 for new_residue in new_residues:

418 new_residue_index = new_residue.index

419 if new_residue_index is None:

420 raise ValueError(

421 "Residue " + str(new_residue) + " is not set in the structure"

422 )

423 new_residue_indices.append(new_residue_index)

424 self.set_residue_indices(new_residue_indices)

425

426 residues = property(get_residues, set_residues, None, "The residues in this chain")

427

428 # Add a residue to the chain

429 def add_residue(self, residue: "Residue"):

430 # Insert the new residue index in the list of residue indices keeping the order

431 sorted_residue_index = bisect(self.residue_indices, residue.index)

432 self.residue_indices.insert(sorted_residue_index, residue.index)

433 # Update the residue internal chain index

434 residue._chain_index = self.index

435

436 # Remove a residue from the chain

437 # WARNING: Note that this function does not trigger the set_residue_indices

438 def remove_residue(self, residue: "Residue"):

439 self.residue_indices.remove(residue.index) # This index MUST be in the list

440 # If we removed the last index then this chain must be removed from its structure

441 if len(self.residue_indices) == 0 and self.structure:

442 self.structure.purge_chain(self)

443 # Update the residue internal chain index

444 residue._chain_index = None

445

446 # Atom indices for all atoms in the chain(read only)

447 # In order to change atom indices they must be changed in their corresponding residues

448 def get_atom_indices(self) -> list[int]:

449 return sum([residue.atom_indices for residue in self.residues], [])

450

451 atom_indices = property(

452 get_atom_indices,

453 None,

454 None,

455 "Atom indices for all atoms in the chain(read only)",

456 )

457

458 # Atoms in the chain(read only)

459 # In order to change atoms they must be changed in their corresponding residues

460 def get_atoms(self) -> list[int]:

461 return sum([residue.atoms for residue in self.residues], [])

462

463 atoms = property(get_atoms, None, None, "Atoms in the chain(read only)")

464

465 # Get the residues sequence in one-letter code

466 def get_sequence(self) -> str:

467 return "".join(

468 [residue_name_2_letter(residue.name) for residue in self.residues]

469 )

470

471

472# A structure is a group of atoms organized in chains and residues

473class Structure:

474 def __init__(

475 self,

476 atoms: list["Atom"] = [],

477 residues: list["Residue"] = [],

478 chains: list["Chain"] = [],

479 ):

480 self.atoms: list[Atom] = []

481 self.residues: list[Residue] = []

482 self.chains: list[Chain] = []

483 # Set references between instances

484 for atom in atoms:

485 self.set_new_atom(atom)

486 for residue in residues:

487 self.set_new_residue(residue)

488 for chain in chains:

489 self.set_new_chain(chain)

490

491 def __repr__(self):

492 return "<Structure(" + str(len(self.atoms)) + " atoms)>"

493

494 # Set a new atom in the structure

495 def set_new_atom(self, atom: "Atom"):

496 atom._structure = self

497 new_atom_index = len(self.atoms)

498 self.atoms.append(atom)

499 atom._index = new_atom_index

500

501 # Set a new residue in the structure

502 # WARNING: Atoms must be set already before setting residues

503 def set_new_residue(self, residue: "Residue"):

504 residue._structure = self

505 new_residue_index = len(self.residues)

506 self.residues.append(residue)

507 residue._index = new_residue_index

508 # In case the residue has atom indices, set relational indices on each atom

509 for atom_index in residue.atom_indices:

510 atom = self.atoms[atom_index]

511 atom._residue_index = new_residue_index

512

513 # Set a new chain in the structure

514 # WARNING: Residues and atoms must be set already before setting chains

515 def set_new_chain(self, chain: "Chain"):

516 chain._structure = self

517 new_chain_index = len(self.chains)

518 self.chains.append(chain)

519 chain._index = new_chain_index

520 # In case the chain has residue indices, set relational indices on each residue

521 for residue_index in chain.residue_indices:

522 residue = self.residues[residue_index]

523 residue._chain_index = new_chain_index

524

525 # Purge chain from the structure

526 # This can be done only when the chain has no residues left in the structure

527 # Renumerate all chain indices which have been offsetted as a result of the purge

528 def purge_chain(self, chain: "Chain"):

529 # Check the chain can be purged

530 if chain not in self.chains:

531 raise ValueError(

532 "Chain " + str(chain.name) + " is not in the structure already"

533 )

534 if len(chain.residue_indices) > 0:

535 raise ValueError(

536 "Chain " + str(chain.name) + " is still having residues and thus it cannot be purged"

537 )

538 # Get the current index of the chain to be purged

539 purged_index = chain.index

540 # Chains and their residues below this index are not to be modified

541 # Chains and their residues over this index must be renumerated

542 for affected_chain in self.chains[purged_index + 1:]:

543 # Chaging the index automatically changes all chain residues '_chain_index' values.

544 affected_chain.index -= 1

545 # Finally, remove the current chain from the list of chains in the structure

546 self.chains.remove(chain)

547

548 # Set the structure from a pdb file

549 @classmethod

550 def from_pdb_file(cls, pdb_filename: str):

551 if not os.path.exists(pdb_filename):

552 raise SystemExit('File "' + pdb_filename + '" not found')

553 # Read the pdb file line by line and set the parsed atoms, residues and chains

554 parsed_atoms = []

555 parsed_residues: list[Residue] = []

556 parsed_chains: list[Chain] = []

557 atom_index = -1

558 residue_index = -1

559 with open(pdb_filename, "r") as file:

560 for line in file:

561 # Parse atoms only

562 start = line[0:6]

563 is_atom = start == "ATOM " or start == "HETATM"

564 if not is_atom:

565 continue

566 # Mine all atom data

567 atom_name = line[11:16].strip()

568 residue_name = line[17:21].strip()

569 chain = line[21:22]

570 residue_number = int(line[22:26])

571 icode = line[26:27]

572 if icode == " ":

573 icode = ""

574 x_coord = float(line[30:38])

575 y_coord = float(line[38:46])

576 z_coord = float(line[46:54])

577 element = line[77:79].strip()

578 # Set the parsed atom, residue and chain

579 parsed_atom = Atom(

580 name=atom_name, element=element, coords=(x_coord, y_coord, z_coord)

581 )

582 parsed_residue = Residue(

583 name=residue_name, number=residue_number, icode=icode

584 )

585 parsed_chain = Chain(name=chain)

586 # Add the parsed atom to the list and update the current atom index

587 parsed_atoms.append(parsed_atom)

588 atom_index += 1

589 # Check if we are in the same chain/residue than before

590 same_chain = parsed_chains and parsed_chains[-1] == parsed_chain

591 same_residue = (

592 same_chain and parsed_residues and parsed_residue.same_inputs_as(parsed_residues[-1])

593 )

594 # Update the residue atom indices

595 # If the residue equals the last parsed residue then use the previous instead

596 if same_residue:

597 parsed_residue = parsed_residues[-1]

598 parsed_residue.atom_indices.append(atom_index)

599 # If it is the same residue then it will be the same chain as well so we can proceed

600 continue

601 # Otherwise, include the new residue in the list and update the current residue index

602 parsed_residues.append(parsed_residue)

603 residue_index += 1

604 parsed_residue.atom_indices.append(atom_index)

605 # If the chain equals the last parsed chain then use the previous instead

606 if same_chain:

607 parsed_chain = parsed_chains[-1]

608 parsed_chain.residue_indices.append(residue_index)

609 continue

610 # Otherwise, include the new chain in the list

611 parsed_chains.append(parsed_chain)

612 parsed_chain.residue_indices.append(residue_index)

613 return cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains)

614

615 # Fix atom elements by gueesing them when missing

616 # Set all elements with the first letter upper and the second(if any) lower

617 # def fix_atom_elements(self):

618 # for atom in self.atoms:

619 # # Make sure elements have the first letter cap and the second letter not cap

620 # if atom.element:

621 # atom.element = first_cap_only(atom.element)

622 # # If elements are missing then guess them from atom names

623 # else:

624 # atom.element = guess_name_element(atom.name)

625

626 # Generate a pdb file with current structure

627 def generate_pdb_file(self, pdb_filename: str):

628 with open(pdb_filename, "w") as file:

629 file.write("REMARK mdtoolbelt generated pdb file\n")

630 for a, atom in enumerate(self.atoms):

631 residue = atom.residue

632 index = str((a + 1) % 100000).rjust(5)

633 name = (

634 " " + str(atom.name).ljust(3)

635 if len(str(atom.name)) < 4

636 else atom.name

637 )

638 residue_name = residue.name.ljust(4)

639 chain = atom.chain.name.rjust(1)

640 residue_number = str(residue.number).rjust(4)

641 icode = residue.icode.rjust(1)

642 coords: Optional[Coords] = atom.coords

643 if not coords:

644 raise ValueError("Atom " + str(atom) + " has no coordinates")

645 else:

646 x_coord, y_coord, z_coord = [

647 "{:.3f}".format(coord).rjust(8) for coord in coords

648 ]

649 occupancy = "1.00" # Just a placeholder

650 temp_factor = "0.00" # Just a placeholder

651 element = atom.element

652 atom_line = (

653 "ATOM " + str(index) + " " + str(name) + " " + residue_name + chain + residue_number + icode + " " + x_coord + y_coord + z_coord + " " + occupancy + " " + temp_factor + " " + element

654 ).ljust(80) + "\n"

655 file.write(atom_line)

656

657 # Get a chain by its name

658 def get_chain_by_name(self, name: str) -> Optional[Chain]:

659 return next((c for c in self.chains if c.name == name), None)

660

661 # Get a summary of the structure

662 def display_summary(self):

663 print("Atoms: " + str(len(self.atoms)))

664 print("Residues: " + str(len(self.residues)))

665 print("Chains: " + str(len(self.chains)))

666 for chain in self.chains:

667 print(

668 "Chain " + str(chain.name) + " (" + str(len(chain.residue_indices)) + " residues)"

669 )

670 print(" -> " + chain.get_sequence())

671

672 # This is an alternative system to find protein chains(anything else is chained as 'X')

673 # This system does not depend on VMD

674 # It totally overrides previous chains since it is expected to be used only when chains are missing

675 def raw_protein_chainer(self):

676 current_chain = self.get_next_available_chain_name()

677 previous_alpha_carbon = None

678 for residue in self.residues:

679 alpha_carbon = next(

680 (atom for atom in residue.atoms if atom.name == "CA"), None

681 )

682 if not alpha_carbon:

683 residue.set_chain("X")

684 continue

685 # Connected aminoacids have their alpha carbons at a distance of around 3.8 Ångstroms

686 residues_are_connected = (

687 previous_alpha_carbon and calculate_distance(previous_alpha_carbon, alpha_carbon) < 4

688 )

689 if not residues_are_connected:

690 current_chain = self.get_next_available_chain_name()

691 if current_chain:

692 residue.set_chain(current_chain)

693 previous_alpha_carbon = alpha_carbon

694

695 # Get the next available chain name

696 # Find alphabetically the first letter which is not yet used as a chain name

697 # If all letters in the alphabet are used already then return None

698 def get_next_available_chain_name(self) -> Optional[str]:

699 current_chain_names = [chain.name for chain in self.chains]

700 return next(

701 (name for name in available_caps if name not in current_chain_names), None

702 )

703

704

705# Calculate the distance between two atoms

706def calculate_distance(atom_1: Atom, atom_2: Atom) -> float:

707 squared_distances_sum = 0.0

708 if atom_1.coords is not None and atom_2.coords is not None:

709 for i in {"x": 0, "y": 1, "z": 2}.values():

710 squared_distances_sum += (atom_1.coords[i] - atom_2.coords[i]) ** 2

711 return math.sqrt(squared_distances_sum)

712

713

714# Set all available chains according to pdb standards

715available_caps = [

716 "A",

717 "B",

718 "C",

719 "D",

720 "E",

721 "F",

722 "G",

723 "H",

724 "I",

725 "J",

726 "K",

727 "L",

728 "M",

729 "N",

730 "O",

731 "P",

732 "Q",

733 "R",

734 "S",

735 "T",

736 "U",

737 "V",

738 "W",

739 "X",

740 "Y",

741 "Z",

742]

743

744# Protein residues

745aminoacids = {

746 "ALA": "A",

747 "ALAN": "A",

748 "ALAC": "A",

749 "ARG": "R",

750 "ARGN": "R",

751 "ARGC": "R",

752 "ASN": "N",

753 "ASNN": "N",

754 "ASNC": "N",

755 "ASP": "D",

756 "ASPN": "D",

757 "ASPC": "D",

758 "CYS": "C",

759 "CYSN": "C",

760 "CYSC": "C",

761 "CYH": "C",

762 "CSH": "C",

763 "CSS": "C",

764 "CYX": "C",

765 "CYP": "C",

766 "GLN": "Q",

767 "GLNN": "Q",

768 "GLNC": "Q",

769 "GLU": "E",

770 "GLUN": "E",

771 "GLUC": "E",

772 "GLY": "G",

773 "GLYN": "G",

774 "GLYC": "G",

775 "HIS": "H",

776 "HISN": "H",

777 "HISC": "H",

778 "HID": "H",

779 "HIE": "H",

780 "HIP": "H",

781 "HSD": "H",

782 "HSE": "H",

783 "ILE": "I",

784 "ILEN": "I",

785 "ILEC": "I",

786 "ILU": "I",

787 "LEU": "L",

788 "LEUN": "L",

789 "LEUC": "L",

790 "LYS": "K",

791 "LYSN": "K",

792 "LYSC": "K",

793 "MET": "M",

794 "METN": "M",

795 "METC": "M",

796 "PHE": "F",

797 "PHEN": "F",

798 "PHEC": "F",

799 "PRO": "P",

800 "PRON": "P",

801 "PROC": "P",

802 "PRØ": "P",

803 "PR0": "P",

804 "PRZ": "P",

805 "SER": "S",

806 "SERN": "S",

807 "SERC": "S",

808 "THR": "T",

809 "THRN": "T",

810 "THRC": "R",

811 "TRP": "W",

812 "TRPN": "W",

813 "TRPC": "W",

814 "TRY": "W",

815 "TYR": "Y",

816 "TYRN": "Y",

817 "TYRC": "Y",

818 "VAL": "V",

819 "VALN": "V",

820 "VALC": "V",

821}

822

823# Nucleic acid residues

824nucleotides = {

825 "A": "A",

826 "A3": "A",

827 "A5": "A",

828 "C": "C",

829 "C3": "C",

830 "C5": "C",

831 "T": "T",

832 "T3": "T",

833 "T5": "T",

834 "G": "G",

835 "G3": "G",

836 "G5": "G",

837 "U": "U",

838 "U3": "U",

839 "U5": "U",

840 "DA": "A",

841 "DT": "T",

842 "DC": "C",

843 "DG": "G",

844}

845

846# All supported residues

847supported_residues = {**aminoacids, **nucleotides}

848

849

850# Transform a residue name to its equivalent single letter code

851# If the residue name is not recognized then return "X"

852# e.g. "ARG" -> "R", "WTF" -> "X"

853# You can choose which residue types are targeted(e.g. aminoacids only)

854# Options are: 'all', 'aminoacids' or 'nucleotides'

855# All residue types are targeted by default

856def residue_name_2_letter(residue_name: str, residue_types: str = "all") -> str:

857 # Set the target residues

858 if residue_types == "all":

859 target_residues = supported_residues

860 elif residue_types == "aminoacids":

861 target_residues = aminoacids

862 elif residue_types == "nucleotides":

863 target_residues = nucleotides

864 else:

865 raise ValueError("Unrecognized residue types " + str(residue_types))

866 # Now find the corresponding letter among the target residues

867 ref = target_residues.get(residue_name, "X")

868 return ref

869

870

871# Map the structure aminoacids sequences against the standard reference sequences

872# References are uniprot accession ids and they are optional

873# For each reference, align the reference sequence with the topology sequence

874# Chains which do not match any reference sequence will be blasted

875# Note that an internet connection is required both to retireve the uniprot reference sequence and to do the blast

876# NEVER FORGET: This system relies on the fact that topology chains are not repeated

877def generate_map_online(

878 structure: "Structure", forced_references: list[str] = []

879) -> Optional[dict]:

880 # Store all the references which are got through this process

881 # Note that not all references may be used at the end

882 references = {}

883 # Get the structure chain sequences

884 structure_sequences = get_chain_sequences(structure)

885 # Find out which chains are protein

886 protein_sequences = []

887 for structure_sequence in structure_sequences:

888 sequence = structure_sequence["sequence"]

889 if next((letter for letter in sequence if letter != "X"), None):

890 structure_sequence["match"] = {"ref": None, "map": None, "score": 0}

891 protein_sequences.append(structure_sequence)

892 # For each input forced reference, get the reference sequence

893 reference_sequences = {}

894 if forced_references:

895 # Check forced_references is not a string

896 if isinstance(forced_references, str):

897 raise TypeError(

898 "Forced references should be a list and not a string a this point"

899 )

900 for uniprot_id in forced_references:

901 reference = get_uniprot_reference(uniprot_id)

902 reference_sequences[reference["uniprot"]] = reference["sequence"]

903 # Save the current whole reference object for later

904 references[reference["uniprot"]] = reference

905

906 # Try to match all protein sequences with the available reference sequences

907 # In case of match, objects in the 'protein_sequences' list are modified by adding the result

908 # Finally, return True if all protein sequences were matched with the available reference sequences or False if not

909 def match_sequences() -> bool:

910 # Track each chain-reference alignment match and keep the score of successful alignments

911 # Now for each structure sequence, align all reference sequences and keep the best alignment(if it meets the minimum)

912 for structure_sequence in protein_sequences:

913 for uniprot_id, reference_sequence in reference_sequences.items():

914 # Align the structure sequence with the reference sequence

915 align_results = align(

916 reference_sequence, structure_sequence["sequence"]

917 )

918 if not align_results:

919 continue

920 # In case we have a valid alignment, check the alignment score is better than the current reference score(if any)

921 sequence_map, align_score = align_results

922 current_reference = structure_sequence["match"]

923 if current_reference["score"] > align_score:

924 continue

925 reference = references[uniprot_id]

926 # If the alignment is better then we impose the new reference

927 structure_sequence["match"] = {

928 "ref": reference,

929 "map": sequence_map,

930 "score": align_score,

931 }

932 # Sum up the current matching

933 print("Reference summary:")

934 for structure_sequence in structure_sequences:

935 name = structure_sequence["name"]

936 match = structure_sequence.get("match", None)

937 if not match:

938 print(" " + name + " -> Not protein")

939 continue

940 reference = structure_sequence["match"].get("ref", None)

941 if not reference:

942 print(" " + name + " -> ¿?")

943 continue

944 uniprot_id = reference["uniprot"]

945 print(" " + name + " -> " + uniprot_id)

946 # Finally, return True if all protein sequences were matched with the available reference sequences or False if not

947 return all(

948 [

949 structure_sequence["match"]["ref"]

950 for structure_sequence in protein_sequences

951 ]

952 )

953

954 # If we have every protein chain matched with a reference already then we stop here

955 if match_sequences():

956 return format_topology_data(structure, protein_sequences)

957 # If there are still any chain which is not matched with a reference then we need more references

958 # To get them, we run a blast with each orphan chain sequence

959 for structure_sequence in protein_sequences:

960 # Skip already references chains

961 if structure_sequence["match"]["ref"]:

962 continue

963 # Run the blast

964 sequence = structure_sequence["sequence"]

965 # uniprot_id = blast(sequence)[0]

966 uniprot_id = blast(sequence) # type: ignore

967 # Build a new reference from the resulting uniprot

968 reference = get_uniprot_reference(uniprot_id) # type: ignore

969 reference_sequences[reference["uniprot"]] = reference["sequence"]

970 # Save the current whole reference object for later

971 references[reference["uniprot"]] = reference

972 # If we have every protein chain matched with a reference already then we stop here

973 if match_sequences():

974 return format_topology_data(structure, protein_sequences)

975 print(

976 "The BLAST failed to find a matching reference sequence for at least one protein sequence"

977 )

978 return None

979

980

981# Try to match all protein sequences with the available reference sequences

982# In case of match, objects in the 'protein_sequences' list are modified by adding the result

983# Finally, return True if all protein sequences were matched with the available reference sequences or False if not

984# def match_sequences(protein_sequences: list, reference_sequences: dict) -> bool:

985# # Track each chain-reference alignment match and keep the score of successful alignments

986# # Now for each structure sequence, align all reference sequences and keep the best alignment(if it meets the minimum)

987# for structure_sequence in protein_sequences:

988# for uniprot_id, reference_sequence in reference_sequences.items():

989# # Align the structure sequence with the reference sequence

990# align_results = align(reference_sequence, structure_sequence['sequence'])

991# if not align_results:

992# continue

993# # In case we have a valid alignment, check the alignment score is better than the current reference score(if any)

994# sequence_map, align_score = align_results

995# current_reference = structure_sequence['match']

996# if current_reference['score'] > align_score:

997# continue

998# reference = references[uniprot_id] # type: ignore

999

1000# # If the alignment is better then we impose the new reference

1001# structure_sequence['match'] = {'ref': reference, 'map': sequence_map, 'score': align_score}

1002# # Finally, return True if all protein sequences were matched with the available reference sequences or False if not

1003# return all([structure_sequence['match']['ref'] for structure_sequence in protein_sequences])

1004

1005

1006# Reformat mapping data to the topology system(introduced later)

1007def format_topology_data(structure: "Structure", mapping_data: list) -> dict:

1008 # Get the count of residues from the structure

1009 residues_count = len(structure.residues)

1010 # Now format data

1011 reference_ids = []

1012 residue_reference_indices: list = [None] * residues_count

1013 residue_reference_numbers = [None] * residues_count

1014 for data in mapping_data:

1015 match = data["match"]

1016 # Get the reference index

1017 # Note that several matches may belong to the same reference and thus have the same index

1018 reference = match["ref"]

1019 uniprot_id = reference["uniprot"]

1020 if uniprot_id not in reference_ids:

1021 reference_ids.append(reference["uniprot"])

1022 reference_index = reference_ids.index(uniprot_id)

1023 residue_indices = data["residue_indices"]

1024 for r, residue_number in enumerate(match["map"]):

1025 if residue_number is None:

1026 continue

1027 residue_index = residue_indices[r]

1028 residue_reference_indices[residue_index] = reference_index

1029 residue_reference_numbers[residue_index] = residue_number

1030 # If there are not references at the end then set all fields as None, in order to save space

1031 if len(reference_ids) == 0:

1032 reference_ids = []

1033 residue_reference_indices = []

1034 residue_reference_numbers = []

1035 # Return the 3 topology fields as they are in the database

1036 residues_map = {

1037 "references": reference_ids,

1038 "residue_reference_indices": residue_reference_indices,

1039 "residue_reference_numbers": residue_reference_numbers,

1040 }

1041 return residues_map

1042

1043

1044# Align a reference aminoacid sequence with each chain sequence in a topology

1045# NEVER FORGET: This system relies on the fact that topology chains are not repeated

1046def map_sequence(ref_sequence: str, structure: "Structure") -> list:

1047 sequences = get_chain_sequences(structure)

1048 mapping: list = []

1049 for s in sequences:

1050 sequence = sequences[s]

1051 sequence_map = align(ref_sequence, sequence)

1052 if sequence_map:

1053 mapping += sequence_map

1054 return mapping

1055

1056

1057# Get each chain name and aminoacids sequence in a topology

1058# Output format example: [{'sequence': 'VNLTT', 'indices': [1, 2, 3, 4, 5]}, ...]

1059def get_chain_sequences(structure: "Structure") -> list:

1060 sequences = []

1061 chains = structure.chains

1062 for chain in chains:

1063 name = chain.name

1064 sequence = ""

1065 residue_indices = []

1066 for residue in chain.residues:

1067 letter = residue_name_2_letter(residue.name, "aminoacids")

1068 sequence += letter

1069 residue_indices.append(residue.index)

1070 # Save sequences by chain name (i.e. chain id or chain letter)

1071 sequence_object = {

1072 "name": name,

1073 "sequence": sequence,

1074 "residue_indices": residue_indices,

1075 }

1076 sequences.append(sequence_object)

1077 return sequences

1078

1079

1080# Align two aminoacid sequences

1081# Return a list with the reference residue indexes (values)

1082# which match each new sequence residues indexes (indexes)

1083# Return also the score of the alignment

1084# Return None when there is not valid alignment at all

1085def align(ref_sequence: str, new_sequence: str) -> Optional[tuple[list, float]]:

1086 # print('- REFERENCE\n' + ref_sequence + '\n- NEW\n' + new_sequence)

1087

1088 # If the new sequence is all 'X' stop here, since this would make the alignment infinite

1089 # Then an array filled with None is returned

1090 if all([letter == "X" for letter in new_sequence]):

1091 return None

1092

1093 # Return the new sequence as best aligned as possible with the reference sequence

1094 alignments = pairwise2.align.localds( # type: ignore

1095 ref_sequence, new_sequence, blosum62, -10, -0.5

1096 )

1097 # DANI: Habría que hacerlo de esta otra forma según el deprecation warning (arriba hay más código)

1098 # DANI: El problema es que el output lo tiene todo menos la sequencia en formato alienada

1099 # DANI: i.e. formato '----VNLTT', que es justo el que necesito

1100 # alignments = aligner.align(ref_sequence, new_sequence)

1101

1102 # In case there are no alignments it means the current chain has nothing to do with this reference

1103 # Then an array filled with None is returned

1104 if len(alignments) == 0:

1105 return None

1106

1107 # Several alignments may be returned, specially when it is a difficult or impossible alignment

1108 # Output format example: '----VNLTT'

1109 best_alignment = alignments[0]

1110 aligned_sequence = best_alignment[1]

1111 print(format_alignment(*alignments[0]))

1112 score = alignments[0][2]

1113 # WARNING: Do not use 'aligned_sequence' length here since it has the total sequence length

1114 normalized_score = score / len(new_sequence)

1115 print("Normalized score: " + str(normalized_score))

1116

1117 # If the normalized score does not reaches the minimum we consider the alignment is not valid

1118 # It may happen when the reference goes for a specific chain but we must map all chains

1119 # This 1 has been found experimentally

1120 # Non maching sequence may return a 0.1-0.3 normalized score

1121 # Matching sequence may return >4 normalized score

1122 if normalized_score < 1:

1123 print("Not valid alignment")

1124 return None

1125

1126 # Match each residue

1127 aligned_mapping: list = []

1128 aligned_index = 0

1129 for l_index, letter in enumerate(aligned_sequence):

1130 # Guions are skipped

1131 if letter == "-":

1132 continue

1133 # Get the current residue of the new sequence

1134 equivalent_letter = new_sequence[aligned_index]

1135 if not letter == equivalent_letter:

1136 raise SystemExit("Something was wrong:S")

1137 # 'X' residues cannot be mapped since reference sequences should never have any 'X'

1138 if letter == "X":

1139 aligned_mapping.append(None)

1140 aligned_index += 1

1141 continue

1142 # Otherwise add the equivalent aligned index to the mapping

1143 # WARNING: Add +1 since uniprot residue counts start at 1, not 0

1144 aligned_mapping.append(l_index + 1)

1145 aligned_index += 1

1146

1147 return aligned_mapping, normalized_score

1148

1149

1150# Given an aminoacids sequence, return a list of uniprot ids

1151# Note that we are blasting against UniProtKB / Swiss-Prot so results will always be valid UniProt accessions

1152# WARNING: This always means results will correspond to curated entries only

1153# If your sequence is from an exotic organism the result may be not from it but from other more studied organism

1154def blast(sequence: str) -> list[str]:

1155 print("Throwing blast... (this may take a few minutes)")

1156 result = NCBIWWW.qblast(program="blastp", database="swissprot", sequence=sequence)

1157 parsed_result = xmltodict.parse(result.read())

1158 hits = parsed_result["BlastOutput"]["BlastOutput_iterations"]["Iteration"][

1159 "Iteration_hits"

1160 ]["Hit"]

1161 # Get the first result only

1162 result = hits[0]

1163 # Return the accession

1164 # DANI: Si algun día tienes problemas porque te falta el '.1' al final del accession puedes sacarlo de Hit_id

1165 accession = result["Hit_accession"]

1166 print("Result: " + accession)

1167 print(result["Hit_def"])

1168 return accession

1169

1170

1171# Given a uniprot accession, use the uniprot API to request its data and then mine what is needed for the database

1172def get_uniprot_reference(uniprot_accession: str) -> dict:

1173 # Request Uniprot

1174 request_url = "https://www.ebi.ac.uk/proteins/api/proteins/" + uniprot_accession

1175 try:

1176 with urllib.request.urlopen(request_url) as response:

1177 parsed_response = json.loads(response.read().decode("utf-8"))

1178 # If the accession is not found in UniProt then the id is not valid

1179 except urllib.error.HTTPError as error: # type: ignore

1180 if error.code == 400:

1181 raise ValueError(

1182 "Something went wrong with the Uniprot request: " + request_url

1183 )

1184 # Get the aminoacids sequence

1185 sequence = parsed_response["sequence"]["sequence"]

1186 return {"uniprot": uniprot_accession, "sequence": sequence}

Coverage for biobb_model/model/fix_pdb_utils.py: 64%

530 statements