Coverage for biobb_model/model/fix_pdb_utils.py: 64%
530 statements
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-28 11:32 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-28 11:32 +0000
1import json
2import math
3import os
4import urllib.request
5from bisect import bisect
6from typing import Optional, Union
8import xmltodict # type: ignore
9from Bio import pairwise2 # type: ignore
10from Bio.Blast import NCBIWWW # type: ignore
11from Bio.pairwise2 import format_alignment # type: ignore
13Coords = tuple[float, float, float]
15try:
16 from Bio.SubsMat import MatrixInfo # type: ignore
18 blosum62 = MatrixInfo.blosum62
19except ImportError:
20 from Bio.Align import substitution_matrices # type: ignore
22 blosum62 = substitution_matrices.load("BLOSUM62")
25# An atom
26class Atom:
27 def __init__(
28 self,
29 name: Optional[str] = None,
30 element: Optional[str] = None,
31 coords: Optional[Coords] = None,
32 ):
33 self.name = name
34 self.element = element
35 self.coords = coords
36 # Set variables to store references to other related instances
37 # These variables will be set further by the structure
38 self._structure: Optional[Structure] = None
39 self._index: Optional[int] = None
40 self._residue_index: Optional[int] = None
42 def __repr__(self):
43 return "<Atom " + str(self.name) + ">"
45 def __eq__(self, other):
46 return self._residue_index == other._residue_index and self.name == other.name
48 # The parent structure (read only)
49 # This value is set by the structure itself
50 def get_structure(self):
51 return self._structure
53 structure = property(get_structure, None, None, "The parent structure (read only)")
55 # The residue index according to parent structure residues (read only)
56 # This value is set by the structure itself
57 def get_index(self):
58 return self._index
60 index = property(
61 get_index,
62 None,
63 None,
64 "The residue index according to parent structure residues (read only)",
65 )
67 # The atom residue index according to parent structure residues
68 # If residue index is set then make changes in all the structure to make this change coherent
69 def get_residue_index(self) -> Optional[int]:
70 return self._residue_index
72 def set_residue_index(self, new_residue_index: int):
73 # If there is not strucutre yet it means the residue is beeing set before the structure
74 # We just save the residue index and wait for the structure to be set
75 if not self.structure:
76 self._residue_index = new_residue_index
77 return
78 # Relational indices are updated through a top-down hierarchy
79 # Affected residues are the ones to update this atom internal residue index
80 current_residue = self.residue
81 current_residue.remove_atom(self)
82 new_residue = self.structure.residues[new_residue_index]
83 new_residue.add_atom(self)
85 residue_index = property(
86 get_residue_index,
87 set_residue_index,
88 None,
89 "The atom residue index according to parent structure residues",
90 )
92 # The atom residue
93 # If residue is set then make changes in all the structure to make this change coherent
94 def get_residue(self) -> Optional["Residue"]:
95 # If there is not strucutre yet it means the atom is beeing set before the structure
96 # In this case it is not possible to get related residues in the structure
97 if not self.structure:
98 return None
99 # Get the residue in the structure according to the residue index
100 return self.structure.residues[self.residue_index]
102 def set_residue(self, new_residue: "Residue"):
103 # Find the new residue index and set it as the atom residue index
104 # Note that the residue must be set in the structure already
105 new_residue_index = new_residue.index
106 if new_residue_index is None:
107 raise ValueError(
108 "Residue " + str(new_residue) + " is not set in the structure"
109 )
110 self.set_residue_index(new_residue_index)
112 residue = property(get_residue, set_residue, None, "The atom residue")
114 # The atom chain index according to parent structure chains (read only)
115 # In order to change the chain index it must be changed in the atom residue
116 def get_chain_index(self) -> int:
117 return self.residue.chain_index
119 chain_index = property(
120 get_chain_index,
121 None,
122 None,
123 "The atom chain index according to parent structure chains (read only)",
124 )
126 # The atom chain(read only)
127 # In order to change the chain it must be changed in the atom residue
128 def get_chain(self) -> "Chain":
129 # Get the chain in the structure according to the chain index
130 return self.structure.chains[self.chain_index]
132 chain = property(get_chain, None, None, "The atom chain (read only)")
135# A residue
136class Residue:
137 def __init__(
138 self,
139 name: Optional[str] = None,
140 number: Optional[int] = None,
141 icode: Optional[str] = None,
142 ):
143 self.name = name
144 self.number = number
145 self.icode = icode
146 # Set variables to store references to other related instaces
147 # These variables will be set further by the structure
148 self._structure: Optional[Structure] = None
149 self._index: Optional[int] = None
150 self._atom_indices: list[int] = []
151 self._chain_index: Optional[int] = None
153 def __repr__(self):
154 return (
155 "<Residue " + str(self.name) + str(self.number) + (self.icode if self.icode else "") + ">"
156 )
158 def __eq__(self, other):
159 if not isinstance(self, type(other)):
160 return False
161 return (
162 self._chain_index == other._chain_index and self.number == other.number and self.icode == other.icode
163 )
165 def __hash__(self):
166 # WARNING: This is susceptible to duplicated residues
167 return hash((self._chain_index, self.number, self.icode))
168 # WARNING: This is not susceptible to duplicated residues
169 # return hash(tuple(self._atom_indices))
171 def same_inputs_as(self, other) -> bool:
172 return (
173 self.name == other.name and self.number == other.number and self.icode == other.icode
174 )
176 # The parent structure (read only)
177 # This value is set by the structure itself
178 def get_structure(self):
179 return self._structure
181 structure = property(get_structure, None, None, "The parent structure (read only)")
183 # The residue index according to parent structure residues (read only)
184 # This value is set by the structure itself
185 def get_index(self):
186 return self._index
188 index = property(
189 get_index,
190 None,
191 None,
192 "The residue index according to parent structure residues(read only)",
193 )
195 # The atom indices according to parent structure atoms for atoms in this residue
196 # If atom indices are set then make changes in all the structure to make this change coherent
197 def get_atom_indices(self) -> list[int]:
198 return self._atom_indices
200 def set_atom_indices(self, new_atom_indices: list[int]):
201 # If there is not strucutre yet it means the residue is beeing set before the structure
202 # We just save atom indices and wait for the structure to be set
203 if not self.structure:
204 self._atom_indices = new_atom_indices
205 return
206 # Update the current atoms
207 for atom in self.atoms:
208 atom._residue_index = None
209 # Update the new atoms
210 for index in new_atom_indices:
211 atom = self.structure.atoms[index]
212 atom._residue_index = self.index
213 # Now new indices are coherent and thus we can save them
214 self._atom_indices = new_atom_indices
216 atom_indices = property(
217 get_atom_indices,
218 set_atom_indices,
219 None,
220 "The atom indices according to parent structure atoms for atoms in this residue",
221 )
223 # The atoms in this residue
224 # If atoms are set then make changes in all the structure to make this change coherent
225 def get_atoms(self) -> list["Atom"]:
226 # If there is not strucutre yet it means the chain is beeing set before the structure
227 # In this case it is not possible to get related atoms in the structure
228 if not self.structure:
229 return []
230 # Get atoms in the structure according to atom indices
231 atoms = self.structure.atoms
232 return [atoms[atom_index] for atom_index in self.atom_indices]
234 def set_atoms(self, new_atoms: list["Atom"]):
235 # Find indices for new atoms and set their indices as the new atom indices
236 # Note that atoms must be set in the structure already
237 new_atom_indices = []
238 for new_atom in new_atoms:
239 new_atom_index = new_atom.index
240 if new_atom_index is None:
241 raise ValueError(
242 "Atom " + str(new_atom) + " is not set in the structure"
243 )
244 new_atom_indices.append(new_atom_index)
245 self.set_atom_indices(new_atom_indices)
247 atoms = property(get_atoms, set_atoms, None, "The atoms in this residue")
249 # Add an atom to the residue
250 def add_atom(self, new_atom: "Atom"):
251 # Insert the new atom index in the list of atom indices keeping the order
252 new_atom_index = new_atom.index
253 sorted_atom_index = bisect(self.atom_indices, new_atom_index)
254 self.atom_indices.insert(sorted_atom_index, new_atom_index)
255 # Update the atom internal index
256 new_atom._residue_index = self.index
258 # Remove an atom from the residue
259 def remove_atom(self, current_atom: "Atom"):
260 # Remove the current atom index from the atom indices list
261 self.atom_indices.remove(current_atom.index) # This index MUST be in the list
262 # Update the atom internal index
263 current_atom._residue_index = None
265 # The residue chain index according to parent structure chains
266 # If chain index is set then make changes in all the structure to make this change coherent
267 def get_chain_index(self) -> Optional[int]:
268 return self._chain_index
270 def set_chain_index(self, new_chain_index: int):
271 # If there is not strucutre yet it means the chain is beeing set before the structure
272 # We just save the chain index and wait for the structure to be set
273 if not self.structure:
274 self._chain_index = new_chain_index
275 return
276 # Relational indices are updated through a top-down hierarchy
277 # Affected chains are the ones to update this residue internal chain index
278 # WARNING: It is critical to find both current and new chains before removing/adding residues
279 # WARNING: It may happend that we remove the last residue in the current chain and the current chain is purged
280 # WARNING: Thus the 'new_chain_index' would be obsolete since the structure.chains list would have changed
281 current_chain = self.chain
282 new_chain = self.structure.chains[new_chain_index]
283 current_chain.remove_residue(self)
284 new_chain.add_residue(self)
286 chain_index = property(
287 get_chain_index,
288 set_chain_index,
289 None,
290 "The residue chain index according to parent structure chains",
291 )
293 # The residue chain
294 # If chain is set then make changes in all the structure to make this change coherent
295 def get_chain(self) -> Union["Chain", list]:
296 # If there is not strucutre yet it means the residue is beeing set before the structure
297 # In this case it is not possible to get related chain in the structure
298 if not self.structure:
299 return []
300 # Get the chain in the structure according to the chain index
301 return self.structure.chains[self.chain_index]
303 def set_chain(self, new_chain: Union["Chain", str]):
304 # In case the chain is just a string we must find/create the corresponding chain
305 if isinstance(new_chain, str):
306 letter = new_chain
307 # Get the residue structure
308 structure = self.structure
309 if not structure:
310 raise ValueError(
311 "Cannot find the corresponding " + new_chain + " chain without the structure"
312 )
313 # Find if the letter belongs to an already existing chain
314 new_chain = structure.get_chain_by_name(letter)
315 # If the chain does not exist yet then create it
316 if not new_chain:
317 new_chain = Chain(name=letter)
318 structure.set_new_chain(new_chain)
319 # Find the new chain index and set it as the residue chain index
320 # Note that the chain must be set in the structure already
321 new_chain_index = new_chain.index
322 if new_chain_index is None:
323 raise ValueError("Chain " + str(new_chain) + " is not set in the structure")
324 if isinstance(new_chain_index, int):
325 self.set_chain_index(new_chain_index)
327 chain = property(get_chain, set_chain, None, "The residue chain")
330# A chain
331class Chain:
332 def __init__(self, name: Optional[str] = None):
333 self.name = name
334 # Set variables to store references to other related instaces
335 # These variables will be set further by the structure
336 self._structure: Optional[Structure] = None
337 self._index: Optional[int] = None
338 self.residue_indices = []
340 def __repr__(self):
341 return "<Chain " + str(self.name) + ">"
343 def __eq__(self, other):
344 return self.name == other.name
346 # The parent structure(read only)
347 # This value is set by the structure itself
348 def get_structure(self):
349 return self._structure
351 structure = property(get_structure, None, None, "The parent structure(read only)")
353 # The residue index according to parent structure residues(read only)
354 # This value is set by the structure itself
355 def get_index(self):
356 return self._index
358 # When the index is set all residues are updated with the nex chain index
359 def set_index(self, index: int):
360 for residue in self.residues:
361 residue._chain_index = index
362 self._index = index
364 index = property(
365 get_index,
366 set_index,
367 None,
368 "The residue index according to parent structure residues(read only)",
369 )
371 # The residue indices according to parent structure residues for residues in this chain
372 # If residue indices are set then make changes in all the structure to make this change coherent
373 def get_residue_indices(self) -> list[int]:
374 return self._residue_indices
376 def set_residue_indices(self, new_residue_indices: list[int]):
377 # If there is not strucutre yet it means the chain is beeing set before the structure
378 # We just save residue indices and wait for the structure to be set
379 if not self.structure:
380 self._residue_indices = new_residue_indices
381 return
382 # Update the current residues
383 for residue in self.residues:
384 residue._chain_index = None
385 # Update the new residues
386 for index in new_residue_indices:
387 residue = self.structure.residues[index]
388 residue._chain_index = self.index
389 # In case the new residue indices list is empty this chain must be removed from its structure
390 if len(new_residue_indices) == 0:
391 self.structure.purge_chain(self)
392 # Now new indices are coherent and thus we can save them
393 self._residue_indices = new_residue_indices
395 residue_indices = property(
396 get_residue_indices,
397 set_residue_indices,
398 None,
399 "The residue indices according to parent structure residues for residues in this residue",
400 )
402 # The residues in this chain
403 # If residues are set then make changes in all the structure to make this change coherent
404 def get_residues(self) -> list["Residue"]:
405 # If there is not strucutre yet it means the chain is beeing set before the structure
406 # In this case it is not possible to get related residues in the structure
407 if not self.structure:
408 return []
409 # Get residues in the structure according to residue indices
410 residues = self.structure.residues
411 return [residues[residue_index] for residue_index in self.residue_indices]
413 def set_residues(self, new_residues: list["Residue"]):
414 # Find indices for new residues and set their indices as the new residue indices
415 # Note that residues must be set in the structure already
416 new_residue_indices = []
417 for new_residue in new_residues:
418 new_residue_index = new_residue.index
419 if new_residue_index is None:
420 raise ValueError(
421 "Residue " + str(new_residue) + " is not set in the structure"
422 )
423 new_residue_indices.append(new_residue_index)
424 self.set_residue_indices(new_residue_indices)
426 residues = property(get_residues, set_residues, None, "The residues in this chain")
428 # Add a residue to the chain
429 def add_residue(self, residue: "Residue"):
430 # Insert the new residue index in the list of residue indices keeping the order
431 sorted_residue_index = bisect(self.residue_indices, residue.index)
432 self.residue_indices.insert(sorted_residue_index, residue.index)
433 # Update the residue internal chain index
434 residue._chain_index = self.index
436 # Remove a residue from the chain
437 # WARNING: Note that this function does not trigger the set_residue_indices
438 def remove_residue(self, residue: "Residue"):
439 self.residue_indices.remove(residue.index) # This index MUST be in the list
440 # If we removed the last index then this chain must be removed from its structure
441 if len(self.residue_indices) == 0 and self.structure:
442 self.structure.purge_chain(self)
443 # Update the residue internal chain index
444 residue._chain_index = None
446 # Atom indices for all atoms in the chain(read only)
447 # In order to change atom indices they must be changed in their corresponding residues
448 def get_atom_indices(self) -> list[int]:
449 return sum([residue.atom_indices for residue in self.residues], [])
451 atom_indices = property(
452 get_atom_indices,
453 None,
454 None,
455 "Atom indices for all atoms in the chain(read only)",
456 )
458 # Atoms in the chain(read only)
459 # In order to change atoms they must be changed in their corresponding residues
460 def get_atoms(self) -> list[int]:
461 return sum([residue.atoms for residue in self.residues], [])
463 atoms = property(get_atoms, None, None, "Atoms in the chain(read only)")
465 # Get the residues sequence in one-letter code
466 def get_sequence(self) -> str:
467 return "".join(
468 [residue_name_2_letter(residue.name) for residue in self.residues]
469 )
472# A structure is a group of atoms organized in chains and residues
473class Structure:
474 def __init__(
475 self,
476 atoms: list["Atom"] = [],
477 residues: list["Residue"] = [],
478 chains: list["Chain"] = [],
479 ):
480 self.atoms: list[Atom] = []
481 self.residues: list[Residue] = []
482 self.chains: list[Chain] = []
483 # Set references between instances
484 for atom in atoms:
485 self.set_new_atom(atom)
486 for residue in residues:
487 self.set_new_residue(residue)
488 for chain in chains:
489 self.set_new_chain(chain)
491 def __repr__(self):
492 return "<Structure(" + str(len(self.atoms)) + " atoms)>"
494 # Set a new atom in the structure
495 def set_new_atom(self, atom: "Atom"):
496 atom._structure = self
497 new_atom_index = len(self.atoms)
498 self.atoms.append(atom)
499 atom._index = new_atom_index
501 # Set a new residue in the structure
502 # WARNING: Atoms must be set already before setting residues
503 def set_new_residue(self, residue: "Residue"):
504 residue._structure = self
505 new_residue_index = len(self.residues)
506 self.residues.append(residue)
507 residue._index = new_residue_index
508 # In case the residue has atom indices, set relational indices on each atom
509 for atom_index in residue.atom_indices:
510 atom = self.atoms[atom_index]
511 atom._residue_index = new_residue_index
513 # Set a new chain in the structure
514 # WARNING: Residues and atoms must be set already before setting chains
515 def set_new_chain(self, chain: "Chain"):
516 chain._structure = self
517 new_chain_index = len(self.chains)
518 self.chains.append(chain)
519 chain._index = new_chain_index
520 # In case the chain has residue indices, set relational indices on each residue
521 for residue_index in chain.residue_indices:
522 residue = self.residues[residue_index]
523 residue._chain_index = new_chain_index
525 # Purge chain from the structure
526 # This can be done only when the chain has no residues left in the structure
527 # Renumerate all chain indices which have been offsetted as a result of the purge
528 def purge_chain(self, chain: "Chain"):
529 # Check the chain can be purged
530 if chain not in self.chains:
531 raise ValueError(
532 "Chain " + str(chain.name) + " is not in the structure already"
533 )
534 if len(chain.residue_indices) > 0:
535 raise ValueError(
536 "Chain " + str(chain.name) + " is still having residues and thus it cannot be purged"
537 )
538 # Get the current index of the chain to be purged
539 purged_index = chain.index
540 # Chains and their residues below this index are not to be modified
541 # Chains and their residues over this index must be renumerated
542 for affected_chain in self.chains[purged_index + 1:]:
543 # Chaging the index automatically changes all chain residues '_chain_index' values.
544 affected_chain.index -= 1
545 # Finally, remove the current chain from the list of chains in the structure
546 self.chains.remove(chain)
548 # Set the structure from a pdb file
549 @classmethod
550 def from_pdb_file(cls, pdb_filename: str):
551 if not os.path.exists(pdb_filename):
552 raise SystemExit('File "' + pdb_filename + '" not found')
553 # Read the pdb file line by line and set the parsed atoms, residues and chains
554 parsed_atoms = []
555 parsed_residues: list[Residue] = []
556 parsed_chains: list[Chain] = []
557 atom_index = -1
558 residue_index = -1
559 with open(pdb_filename, "r") as file:
560 for line in file:
561 # Parse atoms only
562 start = line[0:6]
563 is_atom = start == "ATOM " or start == "HETATM"
564 if not is_atom:
565 continue
566 # Mine all atom data
567 atom_name = line[11:16].strip()
568 residue_name = line[17:21].strip()
569 chain = line[21:22]
570 residue_number = int(line[22:26])
571 icode = line[26:27]
572 if icode == " ":
573 icode = ""
574 x_coord = float(line[30:38])
575 y_coord = float(line[38:46])
576 z_coord = float(line[46:54])
577 element = line[77:79].strip()
578 # Set the parsed atom, residue and chain
579 parsed_atom = Atom(
580 name=atom_name, element=element, coords=(x_coord, y_coord, z_coord)
581 )
582 parsed_residue = Residue(
583 name=residue_name, number=residue_number, icode=icode
584 )
585 parsed_chain = Chain(name=chain)
586 # Add the parsed atom to the list and update the current atom index
587 parsed_atoms.append(parsed_atom)
588 atom_index += 1
589 # Check if we are in the same chain/residue than before
590 same_chain = parsed_chains and parsed_chains[-1] == parsed_chain
591 same_residue = (
592 same_chain and parsed_residues and parsed_residue.same_inputs_as(parsed_residues[-1])
593 )
594 # Update the residue atom indices
595 # If the residue equals the last parsed residue then use the previous instead
596 if same_residue:
597 parsed_residue = parsed_residues[-1]
598 parsed_residue.atom_indices.append(atom_index)
599 # If it is the same residue then it will be the same chain as well so we can proceed
600 continue
601 # Otherwise, include the new residue in the list and update the current residue index
602 parsed_residues.append(parsed_residue)
603 residue_index += 1
604 parsed_residue.atom_indices.append(atom_index)
605 # If the chain equals the last parsed chain then use the previous instead
606 if same_chain:
607 parsed_chain = parsed_chains[-1]
608 parsed_chain.residue_indices.append(residue_index)
609 continue
610 # Otherwise, include the new chain in the list
611 parsed_chains.append(parsed_chain)
612 parsed_chain.residue_indices.append(residue_index)
613 return cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains)
615 # Fix atom elements by gueesing them when missing
616 # Set all elements with the first letter upper and the second(if any) lower
617 # def fix_atom_elements(self):
618 # for atom in self.atoms:
619 # # Make sure elements have the first letter cap and the second letter not cap
620 # if atom.element:
621 # atom.element = first_cap_only(atom.element)
622 # # If elements are missing then guess them from atom names
623 # else:
624 # atom.element = guess_name_element(atom.name)
626 # Generate a pdb file with current structure
627 def generate_pdb_file(self, pdb_filename: str):
628 with open(pdb_filename, "w") as file:
629 file.write("REMARK mdtoolbelt generated pdb file\n")
630 for a, atom in enumerate(self.atoms):
631 residue = atom.residue
632 index = str((a + 1) % 100000).rjust(5)
633 name = (
634 " " + str(atom.name).ljust(3)
635 if len(str(atom.name)) < 4
636 else atom.name
637 )
638 residue_name = residue.name.ljust(4)
639 chain = atom.chain.name.rjust(1)
640 residue_number = str(residue.number).rjust(4)
641 icode = residue.icode.rjust(1)
642 coords: Optional[Coords] = atom.coords
643 if not coords:
644 raise ValueError("Atom " + str(atom) + " has no coordinates")
645 else:
646 x_coord, y_coord, z_coord = [
647 "{:.3f}".format(coord).rjust(8) for coord in coords
648 ]
649 occupancy = "1.00" # Just a placeholder
650 temp_factor = "0.00" # Just a placeholder
651 element = atom.element
652 atom_line = (
653 "ATOM " + str(index) + " " + str(name) + " " + residue_name + chain + residue_number + icode + " " + x_coord + y_coord + z_coord + " " + occupancy + " " + temp_factor + " " + element
654 ).ljust(80) + "\n"
655 file.write(atom_line)
657 # Get a chain by its name
658 def get_chain_by_name(self, name: str) -> Optional[Chain]:
659 return next((c for c in self.chains if c.name == name), None)
661 # Get a summary of the structure
662 def display_summary(self):
663 print("Atoms: " + str(len(self.atoms)))
664 print("Residues: " + str(len(self.residues)))
665 print("Chains: " + str(len(self.chains)))
666 for chain in self.chains:
667 print(
668 "Chain " + str(chain.name) + " (" + str(len(chain.residue_indices)) + " residues)"
669 )
670 print(" -> " + chain.get_sequence())
672 # This is an alternative system to find protein chains(anything else is chained as 'X')
673 # This system does not depend on VMD
674 # It totally overrides previous chains since it is expected to be used only when chains are missing
675 def raw_protein_chainer(self):
676 current_chain = self.get_next_available_chain_name()
677 previous_alpha_carbon = None
678 for residue in self.residues:
679 alpha_carbon = next(
680 (atom for atom in residue.atoms if atom.name == "CA"), None
681 )
682 if not alpha_carbon:
683 residue.set_chain("X")
684 continue
685 # Connected aminoacids have their alpha carbons at a distance of around 3.8 Ångstroms
686 residues_are_connected = (
687 previous_alpha_carbon and calculate_distance(previous_alpha_carbon, alpha_carbon) < 4
688 )
689 if not residues_are_connected:
690 current_chain = self.get_next_available_chain_name()
691 if current_chain:
692 residue.set_chain(current_chain)
693 previous_alpha_carbon = alpha_carbon
695 # Get the next available chain name
696 # Find alphabetically the first letter which is not yet used as a chain name
697 # If all letters in the alphabet are used already then return None
698 def get_next_available_chain_name(self) -> Optional[str]:
699 current_chain_names = [chain.name for chain in self.chains]
700 return next(
701 (name for name in available_caps if name not in current_chain_names), None
702 )
705# Calculate the distance between two atoms
706def calculate_distance(atom_1: Atom, atom_2: Atom) -> float:
707 squared_distances_sum = 0.0
708 if atom_1.coords is not None and atom_2.coords is not None:
709 for i in {"x": 0, "y": 1, "z": 2}.values():
710 squared_distances_sum += (atom_1.coords[i] - atom_2.coords[i]) ** 2
711 return math.sqrt(squared_distances_sum)
714# Set all available chains according to pdb standards
715available_caps = [
716 "A",
717 "B",
718 "C",
719 "D",
720 "E",
721 "F",
722 "G",
723 "H",
724 "I",
725 "J",
726 "K",
727 "L",
728 "M",
729 "N",
730 "O",
731 "P",
732 "Q",
733 "R",
734 "S",
735 "T",
736 "U",
737 "V",
738 "W",
739 "X",
740 "Y",
741 "Z",
742]
744# Protein residues
745aminoacids = {
746 "ALA": "A",
747 "ALAN": "A",
748 "ALAC": "A",
749 "ARG": "R",
750 "ARGN": "R",
751 "ARGC": "R",
752 "ASN": "N",
753 "ASNN": "N",
754 "ASNC": "N",
755 "ASP": "D",
756 "ASPN": "D",
757 "ASPC": "D",
758 "CYS": "C",
759 "CYSN": "C",
760 "CYSC": "C",
761 "CYH": "C",
762 "CSH": "C",
763 "CSS": "C",
764 "CYX": "C",
765 "CYP": "C",
766 "GLN": "Q",
767 "GLNN": "Q",
768 "GLNC": "Q",
769 "GLU": "E",
770 "GLUN": "E",
771 "GLUC": "E",
772 "GLY": "G",
773 "GLYN": "G",
774 "GLYC": "G",
775 "HIS": "H",
776 "HISN": "H",
777 "HISC": "H",
778 "HID": "H",
779 "HIE": "H",
780 "HIP": "H",
781 "HSD": "H",
782 "HSE": "H",
783 "ILE": "I",
784 "ILEN": "I",
785 "ILEC": "I",
786 "ILU": "I",
787 "LEU": "L",
788 "LEUN": "L",
789 "LEUC": "L",
790 "LYS": "K",
791 "LYSN": "K",
792 "LYSC": "K",
793 "MET": "M",
794 "METN": "M",
795 "METC": "M",
796 "PHE": "F",
797 "PHEN": "F",
798 "PHEC": "F",
799 "PRO": "P",
800 "PRON": "P",
801 "PROC": "P",
802 "PRØ": "P",
803 "PR0": "P",
804 "PRZ": "P",
805 "SER": "S",
806 "SERN": "S",
807 "SERC": "S",
808 "THR": "T",
809 "THRN": "T",
810 "THRC": "R",
811 "TRP": "W",
812 "TRPN": "W",
813 "TRPC": "W",
814 "TRY": "W",
815 "TYR": "Y",
816 "TYRN": "Y",
817 "TYRC": "Y",
818 "VAL": "V",
819 "VALN": "V",
820 "VALC": "V",
821}
823# Nucleic acid residues
824nucleotides = {
825 "A": "A",
826 "A3": "A",
827 "A5": "A",
828 "C": "C",
829 "C3": "C",
830 "C5": "C",
831 "T": "T",
832 "T3": "T",
833 "T5": "T",
834 "G": "G",
835 "G3": "G",
836 "G5": "G",
837 "U": "U",
838 "U3": "U",
839 "U5": "U",
840 "DA": "A",
841 "DT": "T",
842 "DC": "C",
843 "DG": "G",
844}
846# All supported residues
847supported_residues = {**aminoacids, **nucleotides}
850# Transform a residue name to its equivalent single letter code
851# If the residue name is not recognized then return "X"
852# e.g. "ARG" -> "R", "WTF" -> "X"
853# You can choose which residue types are targeted(e.g. aminoacids only)
854# Options are: 'all', 'aminoacids' or 'nucleotides'
855# All residue types are targeted by default
856def residue_name_2_letter(residue_name: str, residue_types: str = "all") -> str:
857 # Set the target residues
858 if residue_types == "all":
859 target_residues = supported_residues
860 elif residue_types == "aminoacids":
861 target_residues = aminoacids
862 elif residue_types == "nucleotides":
863 target_residues = nucleotides
864 else:
865 raise ValueError("Unrecognized residue types " + str(residue_types))
866 # Now find the corresponding letter among the target residues
867 ref = target_residues.get(residue_name, "X")
868 return ref
871# Map the structure aminoacids sequences against the standard reference sequences
872# References are uniprot accession ids and they are optional
873# For each reference, align the reference sequence with the topology sequence
874# Chains which do not match any reference sequence will be blasted
875# Note that an internet connection is required both to retireve the uniprot reference sequence and to do the blast
876# NEVER FORGET: This system relies on the fact that topology chains are not repeated
877def generate_map_online(
878 structure: "Structure", forced_references: list[str] = []
879) -> Optional[dict]:
880 # Store all the references which are got through this process
881 # Note that not all references may be used at the end
882 references = {}
883 # Get the structure chain sequences
884 structure_sequences = get_chain_sequences(structure)
885 # Find out which chains are protein
886 protein_sequences = []
887 for structure_sequence in structure_sequences:
888 sequence = structure_sequence["sequence"]
889 if next((letter for letter in sequence if letter != "X"), None):
890 structure_sequence["match"] = {"ref": None, "map": None, "score": 0}
891 protein_sequences.append(structure_sequence)
892 # For each input forced reference, get the reference sequence
893 reference_sequences = {}
894 if forced_references:
895 # Check forced_references is not a string
896 if isinstance(forced_references, str):
897 raise TypeError(
898 "Forced references should be a list and not a string a this point"
899 )
900 for uniprot_id in forced_references:
901 reference = get_uniprot_reference(uniprot_id)
902 reference_sequences[reference["uniprot"]] = reference["sequence"]
903 # Save the current whole reference object for later
904 references[reference["uniprot"]] = reference
906 # Try to match all protein sequences with the available reference sequences
907 # In case of match, objects in the 'protein_sequences' list are modified by adding the result
908 # Finally, return True if all protein sequences were matched with the available reference sequences or False if not
909 def match_sequences() -> bool:
910 # Track each chain-reference alignment match and keep the score of successful alignments
911 # Now for each structure sequence, align all reference sequences and keep the best alignment(if it meets the minimum)
912 for structure_sequence in protein_sequences:
913 for uniprot_id, reference_sequence in reference_sequences.items():
914 # Align the structure sequence with the reference sequence
915 align_results = align(
916 reference_sequence, structure_sequence["sequence"]
917 )
918 if not align_results:
919 continue
920 # In case we have a valid alignment, check the alignment score is better than the current reference score(if any)
921 sequence_map, align_score = align_results
922 current_reference = structure_sequence["match"]
923 if current_reference["score"] > align_score:
924 continue
925 reference = references[uniprot_id]
926 # If the alignment is better then we impose the new reference
927 structure_sequence["match"] = {
928 "ref": reference,
929 "map": sequence_map,
930 "score": align_score,
931 }
932 # Sum up the current matching
933 print("Reference summary:")
934 for structure_sequence in structure_sequences:
935 name = structure_sequence["name"]
936 match = structure_sequence.get("match", None)
937 if not match:
938 print(" " + name + " -> Not protein")
939 continue
940 reference = structure_sequence["match"].get("ref", None)
941 if not reference:
942 print(" " + name + " -> ¿?")
943 continue
944 uniprot_id = reference["uniprot"]
945 print(" " + name + " -> " + uniprot_id)
946 # Finally, return True if all protein sequences were matched with the available reference sequences or False if not
947 return all(
948 [
949 structure_sequence["match"]["ref"]
950 for structure_sequence in protein_sequences
951 ]
952 )
954 # If we have every protein chain matched with a reference already then we stop here
955 if match_sequences():
956 return format_topology_data(structure, protein_sequences)
957 # If there are still any chain which is not matched with a reference then we need more references
958 # To get them, we run a blast with each orphan chain sequence
959 for structure_sequence in protein_sequences:
960 # Skip already references chains
961 if structure_sequence["match"]["ref"]:
962 continue
963 # Run the blast
964 sequence = structure_sequence["sequence"]
965 # uniprot_id = blast(sequence)[0]
966 uniprot_id = blast(sequence) # type: ignore
967 # Build a new reference from the resulting uniprot
968 reference = get_uniprot_reference(uniprot_id) # type: ignore
969 reference_sequences[reference["uniprot"]] = reference["sequence"]
970 # Save the current whole reference object for later
971 references[reference["uniprot"]] = reference
972 # If we have every protein chain matched with a reference already then we stop here
973 if match_sequences():
974 return format_topology_data(structure, protein_sequences)
975 print(
976 "The BLAST failed to find a matching reference sequence for at least one protein sequence"
977 )
978 return None
981# Try to match all protein sequences with the available reference sequences
982# In case of match, objects in the 'protein_sequences' list are modified by adding the result
983# Finally, return True if all protein sequences were matched with the available reference sequences or False if not
984# def match_sequences(protein_sequences: list, reference_sequences: dict) -> bool:
985# # Track each chain-reference alignment match and keep the score of successful alignments
986# # Now for each structure sequence, align all reference sequences and keep the best alignment(if it meets the minimum)
987# for structure_sequence in protein_sequences:
988# for uniprot_id, reference_sequence in reference_sequences.items():
989# # Align the structure sequence with the reference sequence
990# align_results = align(reference_sequence, structure_sequence['sequence'])
991# if not align_results:
992# continue
993# # In case we have a valid alignment, check the alignment score is better than the current reference score(if any)
994# sequence_map, align_score = align_results
995# current_reference = structure_sequence['match']
996# if current_reference['score'] > align_score:
997# continue
998# reference = references[uniprot_id] # type: ignore
1000# # If the alignment is better then we impose the new reference
1001# structure_sequence['match'] = {'ref': reference, 'map': sequence_map, 'score': align_score}
1002# # Finally, return True if all protein sequences were matched with the available reference sequences or False if not
1003# return all([structure_sequence['match']['ref'] for structure_sequence in protein_sequences])
1006# Reformat mapping data to the topology system(introduced later)
1007def format_topology_data(structure: "Structure", mapping_data: list) -> dict:
1008 # Get the count of residues from the structure
1009 residues_count = len(structure.residues)
1010 # Now format data
1011 reference_ids = []
1012 residue_reference_indices: list = [None] * residues_count
1013 residue_reference_numbers = [None] * residues_count
1014 for data in mapping_data:
1015 match = data["match"]
1016 # Get the reference index
1017 # Note that several matches may belong to the same reference and thus have the same index
1018 reference = match["ref"]
1019 uniprot_id = reference["uniprot"]
1020 if uniprot_id not in reference_ids:
1021 reference_ids.append(reference["uniprot"])
1022 reference_index = reference_ids.index(uniprot_id)
1023 residue_indices = data["residue_indices"]
1024 for r, residue_number in enumerate(match["map"]):
1025 if residue_number is None:
1026 continue
1027 residue_index = residue_indices[r]
1028 residue_reference_indices[residue_index] = reference_index
1029 residue_reference_numbers[residue_index] = residue_number
1030 # If there are not references at the end then set all fields as None, in order to save space
1031 if len(reference_ids) == 0:
1032 reference_ids = []
1033 residue_reference_indices = []
1034 residue_reference_numbers = []
1035 # Return the 3 topology fields as they are in the database
1036 residues_map = {
1037 "references": reference_ids,
1038 "residue_reference_indices": residue_reference_indices,
1039 "residue_reference_numbers": residue_reference_numbers,
1040 }
1041 return residues_map
1044# Align a reference aminoacid sequence with each chain sequence in a topology
1045# NEVER FORGET: This system relies on the fact that topology chains are not repeated
1046def map_sequence(ref_sequence: str, structure: "Structure") -> list:
1047 sequences = get_chain_sequences(structure)
1048 mapping: list = []
1049 for s in sequences:
1050 sequence = sequences[s]
1051 sequence_map = align(ref_sequence, sequence)
1052 if sequence_map:
1053 mapping += sequence_map
1054 return mapping
1057# Get each chain name and aminoacids sequence in a topology
1058# Output format example: [{'sequence': 'VNLTT', 'indices': [1, 2, 3, 4, 5]}, ...]
1059def get_chain_sequences(structure: "Structure") -> list:
1060 sequences = []
1061 chains = structure.chains
1062 for chain in chains:
1063 name = chain.name
1064 sequence = ""
1065 residue_indices = []
1066 for residue in chain.residues:
1067 letter = residue_name_2_letter(residue.name, "aminoacids")
1068 sequence += letter
1069 residue_indices.append(residue.index)
1070 # Save sequences by chain name (i.e. chain id or chain letter)
1071 sequence_object = {
1072 "name": name,
1073 "sequence": sequence,
1074 "residue_indices": residue_indices,
1075 }
1076 sequences.append(sequence_object)
1077 return sequences
1080# Align two aminoacid sequences
1081# Return a list with the reference residue indexes (values)
1082# which match each new sequence residues indexes (indexes)
1083# Return also the score of the alignment
1084# Return None when there is not valid alignment at all
1085def align(ref_sequence: str, new_sequence: str) -> Optional[tuple[list, float]]:
1086 # print('- REFERENCE\n' + ref_sequence + '\n- NEW\n' + new_sequence)
1088 # If the new sequence is all 'X' stop here, since this would make the alignment infinite
1089 # Then an array filled with None is returned
1090 if all([letter == "X" for letter in new_sequence]):
1091 return None
1093 # Return the new sequence as best aligned as possible with the reference sequence
1094 alignments = pairwise2.align.localds( # type: ignore
1095 ref_sequence, new_sequence, blosum62, -10, -0.5
1096 )
1097 # DANI: Habría que hacerlo de esta otra forma según el deprecation warning (arriba hay más código)
1098 # DANI: El problema es que el output lo tiene todo menos la sequencia en formato alienada
1099 # DANI: i.e. formato '----VNLTT', que es justo el que necesito
1100 # alignments = aligner.align(ref_sequence, new_sequence)
1102 # In case there are no alignments it means the current chain has nothing to do with this reference
1103 # Then an array filled with None is returned
1104 if len(alignments) == 0:
1105 return None
1107 # Several alignments may be returned, specially when it is a difficult or impossible alignment
1108 # Output format example: '----VNLTT'
1109 best_alignment = alignments[0]
1110 aligned_sequence = best_alignment[1]
1111 print(format_alignment(*alignments[0]))
1112 score = alignments[0][2]
1113 # WARNING: Do not use 'aligned_sequence' length here since it has the total sequence length
1114 normalized_score = score / len(new_sequence)
1115 print("Normalized score: " + str(normalized_score))
1117 # If the normalized score does not reaches the minimum we consider the alignment is not valid
1118 # It may happen when the reference goes for a specific chain but we must map all chains
1119 # This 1 has been found experimentally
1120 # Non maching sequence may return a 0.1-0.3 normalized score
1121 # Matching sequence may return >4 normalized score
1122 if normalized_score < 1:
1123 print("Not valid alignment")
1124 return None
1126 # Match each residue
1127 aligned_mapping: list = []
1128 aligned_index = 0
1129 for l_index, letter in enumerate(aligned_sequence):
1130 # Guions are skipped
1131 if letter == "-":
1132 continue
1133 # Get the current residue of the new sequence
1134 equivalent_letter = new_sequence[aligned_index]
1135 if not letter == equivalent_letter:
1136 raise SystemExit("Something was wrong:S")
1137 # 'X' residues cannot be mapped since reference sequences should never have any 'X'
1138 if letter == "X":
1139 aligned_mapping.append(None)
1140 aligned_index += 1
1141 continue
1142 # Otherwise add the equivalent aligned index to the mapping
1143 # WARNING: Add +1 since uniprot residue counts start at 1, not 0
1144 aligned_mapping.append(l_index + 1)
1145 aligned_index += 1
1147 return aligned_mapping, normalized_score
1150# Given an aminoacids sequence, return a list of uniprot ids
1151# Note that we are blasting against UniProtKB / Swiss-Prot so results will always be valid UniProt accessions
1152# WARNING: This always means results will correspond to curated entries only
1153# If your sequence is from an exotic organism the result may be not from it but from other more studied organism
1154def blast(sequence: str) -> list[str]:
1155 print("Throwing blast... (this may take a few minutes)")
1156 result = NCBIWWW.qblast(program="blastp", database="swissprot", sequence=sequence)
1157 parsed_result = xmltodict.parse(result.read())
1158 hits = parsed_result["BlastOutput"]["BlastOutput_iterations"]["Iteration"][
1159 "Iteration_hits"
1160 ]["Hit"]
1161 # Get the first result only
1162 result = hits[0]
1163 # Return the accession
1164 # DANI: Si algun día tienes problemas porque te falta el '.1' al final del accession puedes sacarlo de Hit_id
1165 accession = result["Hit_accession"]
1166 print("Result: " + accession)
1167 print(result["Hit_def"])
1168 return accession
1171# Given a uniprot accession, use the uniprot API to request its data and then mine what is needed for the database
1172def get_uniprot_reference(uniprot_accession: str) -> dict:
1173 # Request Uniprot
1174 request_url = "https://www.ebi.ac.uk/proteins/api/proteins/" + uniprot_accession
1175 try:
1176 with urllib.request.urlopen(request_url) as response:
1177 parsed_response = json.loads(response.read().decode("utf-8"))
1178 # If the accession is not found in UniProt then the id is not valid
1179 except urllib.error.HTTPError as error: # type: ignore
1180 if error.code == 400:
1181 raise ValueError(
1182 "Something went wrong with the Uniprot request: " + request_url
1183 )
1184 # Get the aminoacids sequence
1185 sequence = parsed_response["sequence"]["sequence"]
1186 return {"uniprot": uniprot_accession, "sequence": sequence}