Coverage for biobb_model/model/fix_pdb_utils.py: 63%
542 statements
« prev ^ index » next coverage.py v7.4.3, created at 2024-03-13 17:26 +0000
« prev ^ index » next coverage.py v7.4.3, created at 2024-03-13 17:26 +0000
1import os
2import urllib.request
3import json
4import math
5from bisect import bisect
6from Bio import pairwise2
7from Bio.pairwise2 import format_alignment
8from Bio.Blast import NCBIWWW
9import xmltodict
10from typing import List, Tuple, Optional, Union
11Coords = Tuple[float, float, float]
13try:
14 from Bio.SubsMat import MatrixInfo
15 blosum62 = MatrixInfo.blosum62
16except ImportError:
17 from Bio.Align import substitution_matrices
18 blosum62 = substitution_matrices.load("BLOSUM62")
21# An atom
22class Atom:
23 def __init__(self, name: Optional[str] = None, element: Optional[str] = None, coords: Optional[Coords] = None):
24 self.name = name
25 self.element = element
26 self.coords = coords
27 # Set variables to store references to other related instances
28 # These variables will be set further by the structure
29 self._structure = None
30 self._index = None
31 self._residue_index = None
33 def __repr__(self):
34 return '<Atom ' + self.name + '>'
36 def __eq__(self, other):
37 return self._residue_index == other._residue_index and self.name == other.name
39 # The parent structure (read only)
40 # This value is set by the structure itself
41 def get_structure(self):
42 return self._structure
43 structure = property(get_structure, None, None, "The parent structure (read only)")
45 # The residue index according to parent structure residues (read only)
46 # This value is set by the structure itself
47 def get_index(self):
48 return self._index
49 index = property(get_index, None, None, "The residue index according to parent structure residues (read only)")
51 # The atom residue index according to parent structure residues
52 # If residue index is set then make changes in all the structure to make this change coherent
53 def get_residue_index(self) -> int:
54 return self._residue_index
56 def set_residue_index(self, new_residue_index: int):
57 # If there is not strucutre yet it means the residue is beeing set before the structure
58 # We just save the residue index and wait for the structure to be set
59 if not self.structure:
60 self._residue_index = new_residue_index
61 return
62 # Relational indices are updated through a top-down hierarchy
63 # Affected residues are the ones to update this atom internal residue index
64 current_residue = self.residue
65 current_residue.remove_atom(self)
66 new_residue = self.structure.residues[new_residue_index]
67 new_residue.add_atom(self)
68 residue_index = property(get_residue_index, set_residue_index, None, "The atom residue index according to parent structure residues")
70 # The atom residue
71 # If residue is set then make changes in all the structure to make this change coherent
72 def get_residue(self) -> 'Residue':
73 # If there is not strucutre yet it means the atom is beeing set before the structure
74 # In this case it is not possible to get related residues in the structure
75 if not self.structure:
76 return None
77 # Get the residue in the structure according to the residue index
78 return self.structure.residues[self.residue_index]
80 def set_residue(self, new_residue: 'Residue'):
81 # Find the new residue index and set it as the atom residue index
82 # Note that the residue must be set in the structure already
83 new_residue_index = new_residue.index
84 if new_residue_index is None:
85 raise ValueError('Residue ' + str(new_residue) + ' is not set in the structure')
86 self.set_residue_index(new_residue_index)
87 residue = property(get_residue, set_residue, None, "The atom residue")
89 # The atom chain index according to parent structure chains (read only)
90 # In order to change the chain index it must be changed in the atom residue
91 def get_chain_index(self) -> int:
92 return self.residue.chain_index
93 chain_index = property(get_chain_index, None, None, "The atom chain index according to parent structure chains (read only)")
95 # The atom chain(read only)
96 # In order to change the chain it must be changed in the atom residue
97 def get_chain(self) -> 'Chain':
98 # Get the chain in the structure according to the chain index
99 return self.structure.chains[self.chain_index]
100 chain = property(get_chain, None, None, "The atom chain (read only)")
103# A residue
104class Residue:
105 def __init__(self, name: Optional[str] = None, number: Optional[int] = None, icode: Optional[str] = None):
106 self.name = name
107 self.number = number
108 self.icode = icode
109 # Set variables to store references to other related instaces
110 # These variables will be set further by the structure
111 self._structure = None
112 self._index = None
113 self._atom_indices = []
114 self._chain_index = None
116 def __repr__(self):
117 return '<Residue ' + self.name + str(self.number) + (self.icode if self.icode else '') + '>'
119 def __eq__(self, other):
120 if type(self) != type(other):
121 return False
122 return (self._chain_index == other._chain_index and self.number == other.number and self.icode == other.icode)
124 def __hash__(self):
125 # WARNING: This is susceptible to duplicated residues
126 return hash((self._chain_index, self.number, self.icode))
127 # WARNING: This is not susceptible to duplicated residues
128 # return hash(tuple(self._atom_indices))
130 def same_inputs_as(self, other) -> bool:
131 return (
132 self.name == other.name and
133 self.number == other.number and
134 self.icode == other.icode
135 )
137 # The parent structure (read only)
138 # This value is set by the structure itself
139 def get_structure(self):
140 return self._structure
141 structure = property(get_structure, None, None, "The parent structure (read only)")
143 # The residue index according to parent structure residues (read only)
144 # This value is set by the structure itself
145 def get_index(self):
146 return self._index
147 index = property(get_index, None, None, "The residue index according to parent structure residues(read only)")
149 # The atom indices according to parent structure atoms for atoms in this residue
150 # If atom indices are set then make changes in all the structure to make this change coherent
151 def get_atom_indices(self) -> List[int]:
152 return self._atom_indices
154 def set_atom_indices(self, new_atom_indices: List[int]):
155 # If there is not strucutre yet it means the residue is beeing set before the structure
156 # We just save atom indices and wait for the structure to be set
157 if not self.structure:
158 self._atom_indices = new_atom_indices
159 return
160 # Update the current atoms
161 for atom in self.atoms:
162 atom._residue_index = None
163 # Update the new atoms
164 for index in new_atom_indices:
165 atom = self.structure.atoms[index]
166 atom._residue_index = self.index
167 # Now new indices are coherent and thus we can save them
168 self._atom_indices = new_atom_indices
169 atom_indices = property(get_atom_indices, set_atom_indices, None, "The atom indices according to parent structure atoms for atoms in this residue")
171 # The atoms in this residue
172 # If atoms are set then make changes in all the structure to make this change coherent
173 def get_atoms(self) -> List['Atom']:
174 # If there is not strucutre yet it means the chain is beeing set before the structure
175 # In this case it is not possible to get related atoms in the structure
176 if not self.structure:
177 return []
178 # Get atoms in the structure according to atom indices
179 atoms = self.structure.atoms
180 return [atoms[atom_index] for atom_index in self.atom_indices]
182 def set_atoms(self, new_atoms: List['Atom']):
183 # Find indices for new atoms and set their indices as the new atom indices
184 # Note that atoms must be set in the structure already
185 new_atom_indices = []
186 for new_atom in new_atoms:
187 new_atom_index = new_atom.index
188 if new_atom_index is None:
189 raise ValueError('Atom ' + str(new_atom) + ' is not set in the structure')
190 new_atom_indices.append(new_atom_index)
191 self.set_atom_indices(new_atom_indices)
192 atoms = property(get_atoms, set_atoms, None, "The atoms in this residue")
194 # Add an atom to the residue
195 def add_atom(self, new_atom: 'Atom'):
196 # Insert the new atom index in the list of atom indices keeping the order
197 new_atom_index = new_atom.index
198 sorted_atom_index = bisect(self.atom_indices, new_atom_index)
199 self.atom_indices.insert(sorted_atom_index, new_atom_index)
200 # Update the atom internal index
201 new_atom._residue_index = self.index
203 # Remove an atom from the residue
204 def remove_atom(self, current_atom: 'Atom'):
205 # Remove the current atom index from the atom indices list
206 self.atom_indices.remove(current_atom.index) # This index MUST be in the list
207 # Update the atom internal index
208 current_atom._residue_index = None
210 # The residue chain index according to parent structure chains
211 # If chain index is set then make changes in all the structure to make this change coherent
212 def get_chain_index(self) -> int:
213 return self._chain_index
215 def set_chain_index(self, new_chain_index: int):
216 # If there is not strucutre yet it means the chain is beeing set before the structure
217 # We just save the chain index and wait for the structure to be set
218 if not self.structure:
219 self._chain_index = new_chain_index
220 return
221 # Relational indices are updated through a top-down hierarchy
222 # Affected chains are the ones to update this residue internal chain index
223 # WARNING: It is critical to find both current and new chains before removing/adding residues
224 # WARNING: It may happend that we remove the last residue in the current chain and the current chain is purged
225 # WARNING: Thus the 'new_chain_index' would be obsolete since the structure.chains list would have changed
226 current_chain = self.chain
227 new_chain = self.structure.chains[new_chain_index]
228 current_chain.remove_residue(self)
229 new_chain.add_residue(self)
230 chain_index = property(get_chain_index, set_chain_index, None, "The residue chain index according to parent structure chains")
232 # The residue chain
233 # If chain is set then make changes in all the structure to make this change coherent
234 def get_chain(self) -> 'Chain':
235 # If there is not strucutre yet it means the residue is beeing set before the structure
236 # In this case it is not possible to get related chain in the structure
237 if not self.structure:
238 return []
239 # Get the chain in the structure according to the chain index
240 return self.structure.chains[self.chain_index]
242 def set_chain(self, new_chain: Union['Chain', str]):
243 # In case the chain is just a string we must find/create the corresponding chain
244 if type(new_chain) == str:
245 letter = new_chain
246 # Get the residue structure
247 structure = self.structure
248 if not structure:
249 raise ValueError('Cannot find the corresponding ' + new_chain + ' chain without the structure')
250 # Find if the letter belongs to an already existing chain
251 new_chain = structure.get_chain_by_name(letter)
252 # If the chain does not exist yet then create it
253 if not new_chain:
254 new_chain = Chain(name=letter)
255 structure.set_new_chain(new_chain)
256 # Find the new chain index and set it as the residue chain index
257 # Note that the chain must be set in the structure already
258 new_chain_index = new_chain.index
259 if new_chain_index is None:
260 raise ValueError('Chain ' + str(new_chain) + ' is not set in the structure')
261 self.set_chain_index(new_chain_index)
262 chain = property(get_chain, set_chain, None, "The residue chain")
265# A chain
266class Chain:
267 def __init__(self, name: Optional[str] = None):
268 self.name = name
269 # Set variables to store references to other related instaces
270 # These variables will be set further by the structure
271 self._structure = None
272 self._index = None
273 self.residue_indices = []
275 def __repr__(self):
276 return '<Chain ' + self.name + '>'
278 def __eq__(self, other):
279 return self.name == other.name
281 # The parent structure(read only)
282 # This value is set by the structure itself
283 def get_structure(self):
284 return self._structure
285 structure = property(get_structure, None, None, "The parent structure(read only)")
287 # The residue index according to parent structure residues(read only)
288 # This value is set by the structure itself
289 def get_index(self):
290 return self._index
292 # When the index is set all residues are updated with the nex chain index
293 def set_index(self, index: int):
294 for residue in self.residues:
295 residue._chain_index = index
296 self._index = index
297 index = property(get_index, set_index, None, "The residue index according to parent structure residues(read only)")
299 # The residue indices according to parent structure residues for residues in this chain
300 # If residue indices are set then make changes in all the structure to make this change coherent
301 def get_residue_indices(self) -> List[int]:
302 return self._residue_indices
304 def set_residue_indices(self, new_residue_indices: List[int]):
305 # If there is not strucutre yet it means the chain is beeing set before the structure
306 # We just save residue indices and wait for the structure to be set
307 if not self.structure:
308 self._residue_indices = new_residue_indices
309 return
310 # Update the current residues
311 for residue in self.residues:
312 residue._chain_index = None
313 # Update the new residues
314 for index in new_residue_indices:
315 residue = self.structure.residues[index]
316 residue._chain_index = self.index
317 # In case the new residue indices list is empty this chain must be removed from its structure
318 if len(new_residue_indices) == 0:
319 self.structure.purge_chain(self)
320 # Now new indices are coherent and thus we can save them
321 self._residue_indices = new_residue_indices
322 residue_indices = property(get_residue_indices, set_residue_indices, None, "The residue indices according to parent structure residues for residues in this residue")
324 # The residues in this chain
325 # If residues are set then make changes in all the structure to make this change coherent
326 def get_residues(self) -> List['Residue']:
327 # If there is not strucutre yet it means the chain is beeing set before the structure
328 # In this case it is not possible to get related residues in the structure
329 if not self.structure:
330 return []
331 # Get residues in the structure according to residue indices
332 residues = self.structure.residues
333 return [residues[residue_index] for residue_index in self.residue_indices]
335 def set_residues(self, new_residues: List['Residue']):
336 # Find indices for new residues and set their indices as the new residue indices
337 # Note that residues must be set in the structure already
338 new_residue_indices = []
339 for new_residue in new_residues:
340 new_residue_index = new_residue.index
341 if new_residue_index is None:
342 raise ValueError('Residue ' + str(new_residue) + ' is not set in the structure')
343 new_residue_indices.append(new_residue_index)
344 self.set_residue_indices(new_residue_indices)
345 residues = property(get_residues, set_residues, None, "The residues in this chain")
347 # Add a residue to the chain
348 def add_residue(self, residue: 'Residue'):
349 # Insert the new residue index in the list of residue indices keeping the order
350 sorted_residue_index = bisect(self.residue_indices, residue.index)
351 self.residue_indices.insert(sorted_residue_index, residue.index)
352 # Update the residue internal chain index
353 residue._chain_index = self.index
355 # Remove a residue from the chain
356 # WARNING: Note that this function does not trigger the set_residue_indices
357 def remove_residue(self, residue: 'Residue'):
358 self.residue_indices.remove(residue.index) # This index MUST be in the list
359 # If we removed the last index then this chain must be removed from its structure
360 if len(self.residue_indices) == 0 and self.structure:
361 self.structure.purge_chain(self)
362 # Update the residue internal chain index
363 residue._chain_index = None
365 # Atom indices for all atoms in the chain(read only)
366 # In order to change atom indices they must be changed in their corresponding residues
367 def get_atom_indices(self) -> List[int]:
368 return sum([residue.atom_indices for residue in self.residues], [])
369 atom_indices = property(get_atom_indices, None, None, "Atom indices for all atoms in the chain(read only)")
371 # Atoms in the chain(read only)
372 # In order to change atoms they must be changed in their corresponding residues
373 def get_atoms(self) -> List[int]:
374 return sum([residue.atoms for residue in self.residues], [])
375 atoms = property(get_atoms, None, None, "Atoms in the chain(read only)")
377 # Get the residues sequence in one-letter code
378 def get_sequence(self) -> str:
379 return ''.join([residue_name_2_letter(residue.name) for residue in self.residues])
382# A structure is a group of atoms organized in chains and residues
383class Structure:
384 def __init__(self, atoms: List['Atom'] = [], residues: List['Residue'] = [], chains: List['Chain'] = []):
385 self.atoms = []
386 self.residues = []
387 self.chains = []
388 # Set references between instances
389 for atom in atoms:
390 self.set_new_atom(atom)
391 for residue in residues:
392 self.set_new_residue(residue)
393 for chain in chains:
394 self.set_new_chain(chain)
396 def __repr__(self):
397 return '<Structure(' + str(len(self.atoms)) + ' atoms)>'
399 # Set a new atom in the structure
400 def set_new_atom(self, atom: 'Atom'):
401 atom._structure = self
402 new_atom_index = len(self.atoms)
403 self.atoms.append(atom)
404 atom._index = new_atom_index
406 # Set a new residue in the structure
407 # WARNING: Atoms must be set already before setting residues
408 def set_new_residue(self, residue: 'Residue'):
409 residue._structure = self
410 new_residue_index = len(self.residues)
411 self.residues.append(residue)
412 residue._index = new_residue_index
413 # In case the residue has atom indices, set relational indices on each atom
414 for atom_index in residue.atom_indices:
415 atom = self.atoms[atom_index]
416 atom._residue_index = new_residue_index
418 # Set a new chain in the structure
419 # WARNING: Residues and atoms must be set already before setting chains
420 def set_new_chain(self, chain: 'Chain'):
421 chain._structure = self
422 new_chain_index = len(self.chains)
423 self.chains.append(chain)
424 chain._index = new_chain_index
425 # In case the chain has residue indices, set relational indices on each residue
426 for residue_index in chain.residue_indices:
427 residue = self.residues[residue_index]
428 residue._chain_index = new_chain_index
430 # Purge chain from the structure
431 # This can be done only when the chain has no residues left in the structure
432 # Renumerate all chain indices which have been offsetted as a result of the purge
433 def purge_chain(self, chain: 'Chain'):
434 # Check the chain can be purged
435 if chain not in self.chains:
436 raise ValueError('Chain ' + chain.name + ' is not in the structure already')
437 if len(chain.residue_indices) > 0:
438 raise ValueError('Chain ' + chain.name + ' is still having residues and thus it cannot be purged')
439 # Get the current index of the chain to be purged
440 purged_index = chain.index
441 # Chains and their residues below this index are not to be modified
442 # Chains and their residues over this index must be renumerated
443 for affected_chain in self.chains[purged_index+1:]:
444 # Chaging the index automatically changes all chain residues '_chain_index' values.
445 affected_chain.index -= 1
446 # Finally, remove the current chain from the list of chains in the structure
447 self.chains.remove(chain)
449 # Set the structure from a pdb file
450 @classmethod
451 def from_pdb_file(cls, pdb_filename: str):
452 if not os.path.exists(pdb_filename):
453 raise SystemExit('File "' + pdb_filename + '" not found')
454 # Read the pdb file line by line and set the parsed atoms, residues and chains
455 parsed_atoms = []
456 parsed_residues = []
457 parsed_chains = []
458 atom_index = -1
459 residue_index = -1
460 with open(pdb_filename, 'r') as file:
461 for line in file:
462 # Parse atoms only
463 start = line[0:6]
464 is_atom = start == 'ATOM ' or start == 'HETATM'
465 if not is_atom:
466 continue
467 # Mine all atom data
468 atom_name = line[11:16].strip()
469 residue_name = line[17:21].strip()
470 chain = line[21:22]
471 residue_number = int(line[22:26])
472 icode = line[26:27]
473 if icode == ' ':
474 icode = ''
475 x_coord = float(line[30:38])
476 y_coord = float(line[38:46])
477 z_coord = float(line[46:54])
478 element = line[77:79].strip()
479 # Set the parsed atom, residue and chain
480 parsed_atom = Atom(name=atom_name, element=element, coords=(x_coord, y_coord, z_coord))
481 parsed_residue = Residue(name=residue_name, number=residue_number, icode=icode)
482 parsed_chain = Chain(name=chain)
483 # Add the parsed atom to the list and update the current atom index
484 parsed_atoms.append(parsed_atom)
485 atom_index += 1
486 # Check if we are in the same chain/residue than before
487 same_chain = parsed_chains and parsed_chains[-1] == parsed_chain
488 same_residue = same_chain and parsed_residues and parsed_residue.same_inputs_as(parsed_residues[-1])
489 # Update the residue atom indices
490 # If the residue equals the last parsed residue then use the previous instead
491 if same_residue:
492 parsed_residue = parsed_residues[-1]
493 parsed_residue.atom_indices.append(atom_index)
494 # If it is the same residue then it will be the same chain as well so we can proceed
495 continue
496 # Otherwise, include the new residue in the list and update the current residue index
497 parsed_residues.append(parsed_residue)
498 residue_index += 1
499 parsed_residue.atom_indices.append(atom_index)
500 # If the chain equals the last parsed chain then use the previous instead
501 if same_chain:
502 parsed_chain = parsed_chains[-1]
503 parsed_chain.residue_indices.append(residue_index)
504 continue
505 # Otherwise, include the new chain in the list
506 parsed_chains.append(parsed_chain)
507 parsed_chain.residue_indices.append(residue_index)
508 return cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains)
510 # Fix atom elements by gueesing them when missing
511 # Set all elements with the first letter upper and the second(if any) lower
512 def fix_atom_elements(self):
513 for atom in self.atoms:
514 # Make sure elements have the first letter cap and the second letter not cap
515 if atom.element:
516 atom.element = first_cap_only(atom.element)
517 # If elements are missing then guess them from atom names
518 else:
519 atom.element = guess_name_element(atom.name)
521 # Generate a pdb file with current structure
522 def generate_pdb_file(self, pdb_filename: str):
523 with open(pdb_filename, "w") as file:
524 file.write('REMARK mdtoolbelt generated pdb file\n')
525 for a, atom in enumerate(self.atoms):
526 residue = atom.residue
527 index = str((a+1) % 100000).rjust(5)
528 name = ' ' + atom.name.ljust(3) if len(atom.name) < 4 else atom.name
529 residue_name = residue.name.ljust(4)
530 chain = atom.chain.name.rjust(1)
531 residue_number = str(residue.number).rjust(4)
532 icode = residue.icode.rjust(1)
533 coords = atom.coords
534 x_coord, y_coord, z_coord = ["{:.3f}".format(coord).rjust(8) for coord in coords]
535 occupancy = '1.00' # Just a placeholder
536 temp_factor = '0.00' # Just a placeholder
537 element = atom.element
538 atom_line = ('ATOM ' + index + ' ' + name + ' ' + residue_name + chain + residue_number + icode + ' ' + x_coord + y_coord + z_coord + ' ' + occupancy + ' ' + temp_factor + ' ' + element).ljust(80) + '\n'
539 file.write(atom_line)
541 # Get a chain by its name
542 def get_chain_by_name(self, name: str) -> 'Chain':
543 return next((c for c in self.chains if c.name == name), None)
545 # Get a summary of the structure
546 def display_summary(self):
547 print('Atoms: ' + str(len(self.atoms)))
548 print('Residues: ' + str(len(self.residues)))
549 print('Chains: ' + str(len(self.chains)))
550 for chain in self.chains:
551 print('Chain ' + chain.name + ' (' + str(len(chain.residue_indices)) + ' residues)')
552 print(' -> ' + chain.get_sequence())
554 # This is an alternative system to find protein chains(anything else is chained as 'X')
555 # This system does not depend on VMD
556 # It totally overrides previous chains since it is expected to be used only when chains are missing
557 def raw_protein_chainer(self):
558 current_chain = self.get_next_available_chain_name()
559 previous_alpha_carbon = None
560 for residue in self.residues:
561 alpha_carbon = next((atom for atom in residue.atoms if atom.name == 'CA'), None)
562 if not alpha_carbon:
563 residue.set_chain('X')
564 continue
565 # Connected aminoacids have their alpha carbons at a distance of around 3.8 Ångstroms
566 residues_are_connected = previous_alpha_carbon and calculate_distance(previous_alpha_carbon, alpha_carbon) < 4
567 if not residues_are_connected:
568 current_chain = self.get_next_available_chain_name()
569 residue.set_chain(current_chain)
570 previous_alpha_carbon = alpha_carbon
572 # Get the next available chain name
573 # Find alphabetically the first letter which is not yet used as a chain name
574 # If all letters in the alphabet are used already then return None
575 def get_next_available_chain_name(self) -> Optional[str]:
576 current_chain_names = [chain.name for chain in self.chains]
577 return next((name for name in available_caps if name not in current_chain_names), None)
580# Calculate the distance between two atoms
581def calculate_distance(atom_1: Atom, atom_2: Atom) -> float:
582 squared_distances_sum = 0
583 for i in {'x': 0, 'y': 1, 'z': 2}.values():
584 squared_distances_sum += (atom_1.coords[i] - atom_2.coords[i])**2
585 return math.sqrt(squared_distances_sum)
588# Set all available chains according to pdb standards
589available_caps = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
590 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
592# Protein residues
593aminoacids = {
594 "ALA": "A",
595 "ALAN": "A",
596 "ALAC": "A",
597 "ARG": "R",
598 "ARGN": "R",
599 "ARGC": "R",
600 "ASN": "N",
601 "ASNN": "N",
602 "ASNC": "N",
603 "ASP": "D",
604 "ASPN": "D",
605 "ASPC": "D",
606 "CYS": "C",
607 "CYSN": "C",
608 "CYSC": "C",
609 "CYH": "C",
610 "CSH": "C",
611 "CSS": "C",
612 "CYX": "C",
613 "CYP": "C",
614 "GLN": "Q",
615 "GLNN": "Q",
616 "GLNC": "Q",
617 "GLU": "E",
618 "GLUN": "E",
619 "GLUC": "E",
620 "GLY": "G",
621 "GLYN": "G",
622 "GLYC": "G",
623 "HIS": "H",
624 "HISN": "H",
625 "HISC": "H",
626 "HID": "H",
627 "HIE": "H",
628 "HIP": "H",
629 "HSD": "H",
630 "HSE": "H",
631 "ILE": "I",
632 "ILEN": "I",
633 "ILEC": "I",
634 "ILU": "I",
635 "LEU": "L",
636 "LEUN": "L",
637 "LEUC": "L",
638 "LYS": "K",
639 "LYSN": "K",
640 "LYSC": "K",
641 "MET": "M",
642 "METN": "M",
643 "METC": "M",
644 "PHE": "F",
645 "PHEN": "F",
646 "PHEC": "F",
647 "PRO": "P",
648 "PRON": "P",
649 "PROC": "P",
650 "PRØ": "P",
651 "PR0": "P",
652 "PRZ": "P",
653 "SER": "S",
654 "SERN": "S",
655 "SERC": "S",
656 "THR": "T",
657 "THRN": "T",
658 "THRC": "R",
659 "TRP": "W",
660 "TRPN": "W",
661 "TRPC": "W",
662 "TRY": "W",
663 "TYR": "Y",
664 "TYRN": "Y",
665 "TYRC": "Y",
666 "VAL": "V",
667 "VALN": "V",
668 "VALC": "V"
669}
671# Nucleic acid residues
672nucleotides = {
673 "A": "A",
674 "A3": "A",
675 "A5": "A",
676 "C": "C",
677 "C3": "C",
678 "C5": "C",
679 "T": "T",
680 "T3": "T",
681 "T5": "T",
682 "G": "G",
683 "G3": "G",
684 "G5": "G",
685 "U": "U",
686 "U3": "U",
687 "U5": "U",
688 "DA": "A",
689 "DT": "T",
690 "DC": "C",
691 "DG": "G",
692}
694# All supported residues
695supported_residues = {**aminoacids, **nucleotides}
698# Transform a residue name to its equivalent single letter code
699# If the residue name is not recognized then return "X"
700# e.g. "ARG" -> "R", "WTF" -> "X"
701# You can choose which residue types are targeted(e.g. aminoacids only)
702# Options are: 'all', 'aminoacids' or 'nucleotides'
703# All residue types are targeted by default
704def residue_name_2_letter(residue_name: str, residue_types: str = "all") -> str:
705 # Set the target residues
706 if residue_types == "all":
707 target_residues = supported_residues
708 elif residue_types == "aminoacids":
709 target_residues = aminoacids
710 elif residue_types == "nucleotides":
711 target_residues = nucleotides
712 else:
713 raise ValueError('Unrecognized residue types ' + str(residue_types))
714 # Now find the corresponding letter among the target residues
715 ref = target_residues.get(residue_name, False)
716 return ref if ref else "X"
719# Map the structure aminoacids sequences against the standard reference sequences
720# References are uniprot accession ids and they are optional
721# For each reference, align the reference sequence with the topology sequence
722# Chains which do not match any reference sequence will be blasted
723# Note that an internet connection is required both to retireve the uniprot reference sequence and to do the blast
724# NEVER FORGET: This system relies on the fact that topology chains are not repeated
725def generate_map_online(structure: 'Structure', forced_references: List[str] = []) -> Optional[dict]:
726 # Store all the references which are got through this process
727 # Note that not all references may be used at the end
728 references = {}
729 # Get the structure chain sequences
730 structure_sequences = get_chain_sequences(structure)
731 # Find out which chains are protein
732 protein_sequences = []
733 for structure_sequence in structure_sequences:
734 sequence = structure_sequence['sequence']
735 if next((letter for letter in sequence if letter != 'X'), None):
736 structure_sequence['match'] = {'ref': None, 'map': None, 'score': 0}
737 protein_sequences.append(structure_sequence)
738 # For each input forced reference, get the reference sequence
739 reference_sequences = {}
740 if forced_references:
741 # Check forced_references is not a string
742 if type(forced_references) == str:
743 raise TypeError('Forced references should be a list and not a string a this point')
744 for uniprot_id in forced_references:
745 reference = get_uniprot_reference(uniprot_id)
746 reference_sequences[reference['uniprot']] = reference['sequence']
747 # Save the current whole reference object for later
748 references[reference['uniprot']] = reference
750 # Try to match all protein sequences with the available reference sequences
751 # In case of match, objects in the 'protein_sequences' list are modified by adding the result
752 # Finally, return True if all protein sequences were matched with the available reference sequences or False if not
753 def match_sequences() -> bool:
754 # Track each chain-reference alignment match and keep the score of successful alignments
755 # Now for each structure sequence, align all reference sequences and keep the best alignment(if it meets the minimum)
756 for structure_sequence in protein_sequences:
757 for uniprot_id, reference_sequence in reference_sequences.items():
758 # Align the structure sequence with the reference sequence
759 align_results = align(reference_sequence, structure_sequence['sequence'])
760 if not align_results:
761 continue
762 # In case we have a valid alignment, check the alignment score is better than the current reference score(if any)
763 sequence_map, align_score = align_results
764 current_reference = structure_sequence['match']
765 if current_reference['score'] > align_score:
766 continue
767 reference = references[uniprot_id]
768 # If the alignment is better then we impose the new reference
769 structure_sequence['match'] = {'ref': reference, 'map': sequence_map, 'score': align_score}
770 # Sum up the current matching
771 print('Reference summary:')
772 for structure_sequence in structure_sequences:
773 name = structure_sequence['name']
774 match = structure_sequence.get('match', None)
775 if not match:
776 print(' ' + name + ' -> Not protein')
777 continue
778 reference = structure_sequence['match'].get('ref', None)
779 if not reference:
780 print(' ' + name + ' -> ¿?')
781 continue
782 uniprot_id = reference['uniprot']
783 print(' ' + name + ' -> ' + uniprot_id)
784 # Finally, return True if all protein sequences were matched with the available reference sequences or False if not
785 return all([structure_sequence['match']['ref'] for structure_sequence in protein_sequences])
786 # If we have every protein chain matched with a reference already then we stop here
787 if match_sequences():
788 return format_topology_data(structure, protein_sequences)
789 # If there are still any chain which is not matched with a reference then we need more references
790 # To get them, we run a blast with each orphan chain sequence
791 for structure_sequence in protein_sequences:
792 # Skip already references chains
793 if structure_sequence['match']['ref']:
794 continue
795 # Run the blast
796 sequence = structure_sequence['sequence']
797 uniprot_id = blast(sequence)
798 # Build a new reference from the resulting uniprot
799 reference = get_uniprot_reference(uniprot_id)
800 reference_sequences[reference['uniprot']] = reference['sequence']
801 # Save the current whole reference object for later
802 references[reference['uniprot']] = reference
803 # If we have every protein chain matched with a reference already then we stop here
804 if match_sequences():
805 return format_topology_data(structure, protein_sequences)
806 print('The BLAST failed to find a matching reference sequence for at least one protein sequence')
807 return None
810# Try to match all protein sequences with the available reference sequences
811# In case of match, objects in the 'protein_sequences' list are modified by adding the result
812# Finally, return True if all protein sequences were matched with the available reference sequences or False if not
813def match_sequences(protein_sequences: list, reference_sequences: dict) -> bool:
814 # Track each chain-reference alignment match and keep the score of successful alignments
815 # Now for each structure sequence, align all reference sequences and keep the best alignment(if it meets the minimum)
816 for structure_sequence in protein_sequences:
817 for uniprot_id, reference_sequence in reference_sequences.items():
818 # Align the structure sequence with the reference sequence
819 align_results = align(reference_sequence, structure_sequence['sequence'])
820 if not align_results:
821 continue
822 # In case we have a valid alignment, check the alignment score is better than the current reference score(if any)
823 sequence_map, align_score = align_results
824 current_reference = structure_sequence['match']
825 if current_reference['score'] > align_score:
826 continue
827 reference = references[uniprot_id]
829 # If the alignment is better then we impose the new reference
830 structure_sequence['match'] = {'ref': reference, 'map': sequence_map, 'score': align_score}
831 # Finally, return True if all protein sequences were matched with the available reference sequences or False if not
832 return all([structure_sequence['match']['ref'] for structure_sequence in protein_sequences])
835# Reformat mapping data to the topology system(introduced later)
836def format_topology_data(structure: 'Structure', mapping_data: list) -> dict:
837 # Get the count of residues from the structure
838 residues_count = len(structure.residues)
839 # Now format data
840 reference_ids = []
841 residue_reference_indices = [None] * residues_count
842 residue_reference_numbers = [None] * residues_count
843 for data in mapping_data:
844 match = data['match']
845 # Get the reference index
846 # Note that several matches may belong to the same reference and thus have the same index
847 reference = match['ref']
848 uniprot_id = reference['uniprot']
849 if uniprot_id not in reference_ids:
850 reference_ids.append(reference['uniprot'])
851 reference_index = reference_ids.index(uniprot_id)
852 residue_indices = data['residue_indices']
853 for r, residue_number in enumerate(match['map']):
854 if residue_number is None:
855 continue
856 residue_index = residue_indices[r]
857 residue_reference_indices[residue_index] = reference_index
858 residue_reference_numbers[residue_index] = residue_number
859 # If there are not references at the end then set all fields as None, in order to save space
860 if len(reference_ids) == 0:
861 reference_ids = None
862 residue_reference_indices = None
863 residue_reference_numbers = None
864 # Return the 3 topology fields as they are in the database
865 residues_map = {'references': reference_ids, 'residue_reference_indices': residue_reference_indices, 'residue_reference_numbers': residue_reference_numbers}
866 return residues_map
869# Align a reference aminoacid sequence with each chain sequence in a topology
870# NEVER FORGET: This system relies on the fact that topology chains are not repeated
871def map_sequence(ref_sequence: str, structure: 'Structure') -> list:
872 sequences = get_chain_sequences(structure)
873 mapping = []
874 for s in sequences:
875 sequence = sequences[s]
876 sequence_map = align(ref_sequence, sequence)
877 mapping += sequence_map
878 return mapping
881# Get each chain name and aminoacids sequence in a topology
882# Output format example: [{'sequence': 'VNLTT', 'indices': [1, 2, 3, 4, 5]}, ...]
883def get_chain_sequences(structure: 'Structure') -> list:
884 sequences = []
885 chains = structure.chains
886 for chain in chains:
887 name = chain.name
888 sequence = ''
889 residue_indices = []
890 for residue in chain.residues:
891 letter = residue_name_2_letter(residue.name, 'aminoacids')
892 sequence += letter
893 residue_indices.append(residue.index)
894 # Save sequences by chain name (i.e. chain id or chain letter)
895 sequence_object = {'name': name, 'sequence': sequence, 'residue_indices': residue_indices}
896 sequences.append(sequence_object)
897 return sequences
900# Align two aminoacid sequences
901# Return a list with the reference residue indexes (values)
902# which match each new sequence residues indexes (indexes)
903# Return also the score of the alignment
904# Return None when there is not valid alignment at all
905def align(ref_sequence: str, new_sequence: str) -> Optional[Tuple[list, float]]:
907 # print('- REFERENCE\n' + ref_sequence + '\n- NEW\n' + new_sequence)
909 # If the new sequence is all 'X' stop here, since this would make the alignment infinite
910 # Then an array filled with None is returned
911 if all([letter == 'X' for letter in new_sequence]):
912 return None
914 # Return the new sequence as best aligned as possible with the reference sequence
915 alignments = pairwise2.align.localds(ref_sequence, new_sequence, blosum62, -10, -0.5)
916 # DANI: Habría que hacerlo de esta otra forma según el deprecation warning (arriba hay más código)
917 # DANI: El problema es que el output lo tiene todo menos la sequencia en formato alienada
918 # DANI: i.e. formato '----VNLTT', que es justo el que necesito
919 # alignments = aligner.align(ref_sequence, new_sequence)
921 # In case there are no alignments it means the current chain has nothing to do with this reference
922 # Then an array filled with None is returned
923 if len(alignments) == 0:
924 return None
926 # Several alignments may be returned, specially when it is a difficult or impossible alignment
927 # Output format example: '----VNLTT'
928 best_alignment = alignments[0]
929 aligned_sequence = best_alignment[1]
930 print(format_alignment(*alignments[0]))
931 score = alignments[0][2]
932 # WARNING: Do not use 'aligned_sequence' length here since it has the total sequence length
933 normalized_score = score / len(new_sequence)
934 print('Normalized score: ' + str(normalized_score))
936 # If the normalized score does not reaches the minimum we consider the alignment is not valid
937 # It may happen when the reference goes for a specific chain but we must map all chains
938 # This 1 has been found experimentally
939 # Non maching sequence may return a 0.1-0.3 normalized score
940 # Matching sequence may return >4 normalized score
941 if normalized_score < 1:
942 print('Not valid alignment')
943 return None
945 # Match each residue
946 aligned_mapping = []
947 aligned_index = 0
948 for l_index, letter in enumerate(aligned_sequence):
949 # Guions are skipped
950 if letter == '-':
951 continue
952 # Get the current residue of the new sequence
953 equivalent_letter = new_sequence[aligned_index]
954 if not letter == equivalent_letter:
955 raise SystemExit('Something was wrong:S')
956 # 'X' residues cannot be mapped since reference sequences should never have any 'X'
957 if letter == 'X':
958 aligned_mapping.append(None)
959 aligned_index += 1
960 continue
961 # Otherwise add the equivalent aligned index to the mapping
962 # WARNING: Add +1 since uniprot residue counts start at 1, not 0
963 aligned_mapping.append(l_index + 1)
964 aligned_index += 1
966 return aligned_mapping, normalized_score
969# Given an aminoacids sequence, return a list of uniprot ids
970# Note that we are blasting against UniProtKB / Swiss-Prot so results will always be valid UniProt accessions
971# WARNING: This always means results will correspond to curated entries only
972# If your sequence is from an exotic organism the result may be not from it but from other more studied organism
973def blast(sequence: str) -> List[str]:
974 print('Throwing blast... (this may take a few minutes)')
975 result = NCBIWWW.qblast(program="blastp", database="swissprot", sequence=sequence)
976 parsed_result = xmltodict.parse(result.read())
977 hits = parsed_result['BlastOutput']['BlastOutput_iterations']['Iteration']['Iteration_hits']['Hit']
978 # Get the first result only
979 result = hits[0]
980 # Return the accession
981 # DANI: Si algun día tienes problemas porque te falta el '.1' al final del accession puedes sacarlo de Hit_id
982 accession = result['Hit_accession']
983 print('Result: ' + accession)
984 print(result['Hit_def'])
985 return accession
988# Given a uniprot accession, use the uniprot API to request its data and then mine what is needed for the database
989def get_uniprot_reference(uniprot_accession: str) -> dict:
990 # Request Uniprot
991 request_url = 'https://www.ebi.ac.uk/proteins/api/proteins/' + uniprot_accession
992 try:
993 with urllib.request.urlopen(request_url) as response:
994 parsed_response = json.loads(response.read().decode("utf-8"))
995 # If the accession is not found in UniProt then the id is not valid
996 except urllib.error.HTTPError as error:
997 if error.code == 400:
998 raise ValueError('Something went wrong with the Uniprot request: ' + request_url)
999 # Get the aminoacids sequence
1000 sequence = parsed_response['sequence']['sequence']
1001 return {'uniprot': uniprot_accession, 'sequence': sequence}