Coverage for biobb_model/model/fix_pdb_utils.py: 63%

542 statements  

« prev     ^ index     » next       coverage.py v7.4.3, created at 2024-03-13 17:26 +0000

1import os 

2import urllib.request 

3import json 

4import math 

5from bisect import bisect 

6from Bio import pairwise2 

7from Bio.pairwise2 import format_alignment 

8from Bio.Blast import NCBIWWW 

9import xmltodict 

10from typing import List, Tuple, Optional, Union 

11Coords = Tuple[float, float, float] 

12 

13try: 

14 from Bio.SubsMat import MatrixInfo 

15 blosum62 = MatrixInfo.blosum62 

16except ImportError: 

17 from Bio.Align import substitution_matrices 

18 blosum62 = substitution_matrices.load("BLOSUM62") 

19 

20 

21# An atom 

22class Atom: 

23 def __init__(self, name: Optional[str] = None, element: Optional[str] = None, coords: Optional[Coords] = None): 

24 self.name = name 

25 self.element = element 

26 self.coords = coords 

27 # Set variables to store references to other related instances 

28 # These variables will be set further by the structure 

29 self._structure = None 

30 self._index = None 

31 self._residue_index = None 

32 

33 def __repr__(self): 

34 return '<Atom ' + self.name + '>' 

35 

36 def __eq__(self, other): 

37 return self._residue_index == other._residue_index and self.name == other.name 

38 

39 # The parent structure (read only) 

40 # This value is set by the structure itself 

41 def get_structure(self): 

42 return self._structure 

43 structure = property(get_structure, None, None, "The parent structure (read only)") 

44 

45 # The residue index according to parent structure residues (read only) 

46 # This value is set by the structure itself 

47 def get_index(self): 

48 return self._index 

49 index = property(get_index, None, None, "The residue index according to parent structure residues (read only)") 

50 

51 # The atom residue index according to parent structure residues 

52 # If residue index is set then make changes in all the structure to make this change coherent 

53 def get_residue_index(self) -> int: 

54 return self._residue_index 

55 

56 def set_residue_index(self, new_residue_index: int): 

57 # If there is not strucutre yet it means the residue is beeing set before the structure 

58 # We just save the residue index and wait for the structure to be set 

59 if not self.structure: 

60 self._residue_index = new_residue_index 

61 return 

62 # Relational indices are updated through a top-down hierarchy 

63 # Affected residues are the ones to update this atom internal residue index 

64 current_residue = self.residue 

65 current_residue.remove_atom(self) 

66 new_residue = self.structure.residues[new_residue_index] 

67 new_residue.add_atom(self) 

68 residue_index = property(get_residue_index, set_residue_index, None, "The atom residue index according to parent structure residues") 

69 

70 # The atom residue 

71 # If residue is set then make changes in all the structure to make this change coherent 

72 def get_residue(self) -> 'Residue': 

73 # If there is not strucutre yet it means the atom is beeing set before the structure 

74 # In this case it is not possible to get related residues in the structure 

75 if not self.structure: 

76 return None 

77 # Get the residue in the structure according to the residue index 

78 return self.structure.residues[self.residue_index] 

79 

80 def set_residue(self, new_residue: 'Residue'): 

81 # Find the new residue index and set it as the atom residue index 

82 # Note that the residue must be set in the structure already 

83 new_residue_index = new_residue.index 

84 if new_residue_index is None: 

85 raise ValueError('Residue ' + str(new_residue) + ' is not set in the structure') 

86 self.set_residue_index(new_residue_index) 

87 residue = property(get_residue, set_residue, None, "The atom residue") 

88 

89 # The atom chain index according to parent structure chains (read only) 

90 # In order to change the chain index it must be changed in the atom residue 

91 def get_chain_index(self) -> int: 

92 return self.residue.chain_index 

93 chain_index = property(get_chain_index, None, None, "The atom chain index according to parent structure chains (read only)") 

94 

95 # The atom chain(read only) 

96 # In order to change the chain it must be changed in the atom residue 

97 def get_chain(self) -> 'Chain': 

98 # Get the chain in the structure according to the chain index 

99 return self.structure.chains[self.chain_index] 

100 chain = property(get_chain, None, None, "The atom chain (read only)") 

101 

102 

103# A residue 

104class Residue: 

105 def __init__(self, name: Optional[str] = None, number: Optional[int] = None, icode: Optional[str] = None): 

106 self.name = name 

107 self.number = number 

108 self.icode = icode 

109 # Set variables to store references to other related instaces 

110 # These variables will be set further by the structure 

111 self._structure = None 

112 self._index = None 

113 self._atom_indices = [] 

114 self._chain_index = None 

115 

116 def __repr__(self): 

117 return '<Residue ' + self.name + str(self.number) + (self.icode if self.icode else '') + '>' 

118 

119 def __eq__(self, other): 

120 if type(self) != type(other): 

121 return False 

122 return (self._chain_index == other._chain_index and self.number == other.number and self.icode == other.icode) 

123 

124 def __hash__(self): 

125 # WARNING: This is susceptible to duplicated residues 

126 return hash((self._chain_index, self.number, self.icode)) 

127 # WARNING: This is not susceptible to duplicated residues 

128 # return hash(tuple(self._atom_indices)) 

129 

130 def same_inputs_as(self, other) -> bool: 

131 return ( 

132 self.name == other.name and 

133 self.number == other.number and 

134 self.icode == other.icode 

135 ) 

136 

137 # The parent structure (read only) 

138 # This value is set by the structure itself 

139 def get_structure(self): 

140 return self._structure 

141 structure = property(get_structure, None, None, "The parent structure (read only)") 

142 

143 # The residue index according to parent structure residues (read only) 

144 # This value is set by the structure itself 

145 def get_index(self): 

146 return self._index 

147 index = property(get_index, None, None, "The residue index according to parent structure residues(read only)") 

148 

149 # The atom indices according to parent structure atoms for atoms in this residue 

150 # If atom indices are set then make changes in all the structure to make this change coherent 

151 def get_atom_indices(self) -> List[int]: 

152 return self._atom_indices 

153 

154 def set_atom_indices(self, new_atom_indices: List[int]): 

155 # If there is not strucutre yet it means the residue is beeing set before the structure 

156 # We just save atom indices and wait for the structure to be set 

157 if not self.structure: 

158 self._atom_indices = new_atom_indices 

159 return 

160 # Update the current atoms 

161 for atom in self.atoms: 

162 atom._residue_index = None 

163 # Update the new atoms 

164 for index in new_atom_indices: 

165 atom = self.structure.atoms[index] 

166 atom._residue_index = self.index 

167 # Now new indices are coherent and thus we can save them 

168 self._atom_indices = new_atom_indices 

169 atom_indices = property(get_atom_indices, set_atom_indices, None, "The atom indices according to parent structure atoms for atoms in this residue") 

170 

171 # The atoms in this residue 

172 # If atoms are set then make changes in all the structure to make this change coherent 

173 def get_atoms(self) -> List['Atom']: 

174 # If there is not strucutre yet it means the chain is beeing set before the structure 

175 # In this case it is not possible to get related atoms in the structure 

176 if not self.structure: 

177 return [] 

178 # Get atoms in the structure according to atom indices 

179 atoms = self.structure.atoms 

180 return [atoms[atom_index] for atom_index in self.atom_indices] 

181 

182 def set_atoms(self, new_atoms: List['Atom']): 

183 # Find indices for new atoms and set their indices as the new atom indices 

184 # Note that atoms must be set in the structure already 

185 new_atom_indices = [] 

186 for new_atom in new_atoms: 

187 new_atom_index = new_atom.index 

188 if new_atom_index is None: 

189 raise ValueError('Atom ' + str(new_atom) + ' is not set in the structure') 

190 new_atom_indices.append(new_atom_index) 

191 self.set_atom_indices(new_atom_indices) 

192 atoms = property(get_atoms, set_atoms, None, "The atoms in this residue") 

193 

194 # Add an atom to the residue 

195 def add_atom(self, new_atom: 'Atom'): 

196 # Insert the new atom index in the list of atom indices keeping the order 

197 new_atom_index = new_atom.index 

198 sorted_atom_index = bisect(self.atom_indices, new_atom_index) 

199 self.atom_indices.insert(sorted_atom_index, new_atom_index) 

200 # Update the atom internal index 

201 new_atom._residue_index = self.index 

202 

203 # Remove an atom from the residue 

204 def remove_atom(self, current_atom: 'Atom'): 

205 # Remove the current atom index from the atom indices list 

206 self.atom_indices.remove(current_atom.index) # This index MUST be in the list 

207 # Update the atom internal index 

208 current_atom._residue_index = None 

209 

210 # The residue chain index according to parent structure chains 

211 # If chain index is set then make changes in all the structure to make this change coherent 

212 def get_chain_index(self) -> int: 

213 return self._chain_index 

214 

215 def set_chain_index(self, new_chain_index: int): 

216 # If there is not strucutre yet it means the chain is beeing set before the structure 

217 # We just save the chain index and wait for the structure to be set 

218 if not self.structure: 

219 self._chain_index = new_chain_index 

220 return 

221 # Relational indices are updated through a top-down hierarchy 

222 # Affected chains are the ones to update this residue internal chain index 

223 # WARNING: It is critical to find both current and new chains before removing/adding residues 

224 # WARNING: It may happend that we remove the last residue in the current chain and the current chain is purged 

225 # WARNING: Thus the 'new_chain_index' would be obsolete since the structure.chains list would have changed 

226 current_chain = self.chain 

227 new_chain = self.structure.chains[new_chain_index] 

228 current_chain.remove_residue(self) 

229 new_chain.add_residue(self) 

230 chain_index = property(get_chain_index, set_chain_index, None, "The residue chain index according to parent structure chains") 

231 

232 # The residue chain 

233 # If chain is set then make changes in all the structure to make this change coherent 

234 def get_chain(self) -> 'Chain': 

235 # If there is not strucutre yet it means the residue is beeing set before the structure 

236 # In this case it is not possible to get related chain in the structure 

237 if not self.structure: 

238 return [] 

239 # Get the chain in the structure according to the chain index 

240 return self.structure.chains[self.chain_index] 

241 

242 def set_chain(self, new_chain: Union['Chain', str]): 

243 # In case the chain is just a string we must find/create the corresponding chain 

244 if type(new_chain) == str: 

245 letter = new_chain 

246 # Get the residue structure 

247 structure = self.structure 

248 if not structure: 

249 raise ValueError('Cannot find the corresponding ' + new_chain + ' chain without the structure') 

250 # Find if the letter belongs to an already existing chain 

251 new_chain = structure.get_chain_by_name(letter) 

252 # If the chain does not exist yet then create it 

253 if not new_chain: 

254 new_chain = Chain(name=letter) 

255 structure.set_new_chain(new_chain) 

256 # Find the new chain index and set it as the residue chain index 

257 # Note that the chain must be set in the structure already 

258 new_chain_index = new_chain.index 

259 if new_chain_index is None: 

260 raise ValueError('Chain ' + str(new_chain) + ' is not set in the structure') 

261 self.set_chain_index(new_chain_index) 

262 chain = property(get_chain, set_chain, None, "The residue chain") 

263 

264 

265# A chain 

266class Chain: 

267 def __init__(self, name: Optional[str] = None): 

268 self.name = name 

269 # Set variables to store references to other related instaces 

270 # These variables will be set further by the structure 

271 self._structure = None 

272 self._index = None 

273 self.residue_indices = [] 

274 

275 def __repr__(self): 

276 return '<Chain ' + self.name + '>' 

277 

278 def __eq__(self, other): 

279 return self.name == other.name 

280 

281 # The parent structure(read only) 

282 # This value is set by the structure itself 

283 def get_structure(self): 

284 return self._structure 

285 structure = property(get_structure, None, None, "The parent structure(read only)") 

286 

287 # The residue index according to parent structure residues(read only) 

288 # This value is set by the structure itself 

289 def get_index(self): 

290 return self._index 

291 

292 # When the index is set all residues are updated with the nex chain index 

293 def set_index(self, index: int): 

294 for residue in self.residues: 

295 residue._chain_index = index 

296 self._index = index 

297 index = property(get_index, set_index, None, "The residue index according to parent structure residues(read only)") 

298 

299 # The residue indices according to parent structure residues for residues in this chain 

300 # If residue indices are set then make changes in all the structure to make this change coherent 

301 def get_residue_indices(self) -> List[int]: 

302 return self._residue_indices 

303 

304 def set_residue_indices(self, new_residue_indices: List[int]): 

305 # If there is not strucutre yet it means the chain is beeing set before the structure 

306 # We just save residue indices and wait for the structure to be set 

307 if not self.structure: 

308 self._residue_indices = new_residue_indices 

309 return 

310 # Update the current residues 

311 for residue in self.residues: 

312 residue._chain_index = None 

313 # Update the new residues 

314 for index in new_residue_indices: 

315 residue = self.structure.residues[index] 

316 residue._chain_index = self.index 

317 # In case the new residue indices list is empty this chain must be removed from its structure 

318 if len(new_residue_indices) == 0: 

319 self.structure.purge_chain(self) 

320 # Now new indices are coherent and thus we can save them 

321 self._residue_indices = new_residue_indices 

322 residue_indices = property(get_residue_indices, set_residue_indices, None, "The residue indices according to parent structure residues for residues in this residue") 

323 

324 # The residues in this chain 

325 # If residues are set then make changes in all the structure to make this change coherent 

326 def get_residues(self) -> List['Residue']: 

327 # If there is not strucutre yet it means the chain is beeing set before the structure 

328 # In this case it is not possible to get related residues in the structure 

329 if not self.structure: 

330 return [] 

331 # Get residues in the structure according to residue indices 

332 residues = self.structure.residues 

333 return [residues[residue_index] for residue_index in self.residue_indices] 

334 

335 def set_residues(self, new_residues: List['Residue']): 

336 # Find indices for new residues and set their indices as the new residue indices 

337 # Note that residues must be set in the structure already 

338 new_residue_indices = [] 

339 for new_residue in new_residues: 

340 new_residue_index = new_residue.index 

341 if new_residue_index is None: 

342 raise ValueError('Residue ' + str(new_residue) + ' is not set in the structure') 

343 new_residue_indices.append(new_residue_index) 

344 self.set_residue_indices(new_residue_indices) 

345 residues = property(get_residues, set_residues, None, "The residues in this chain") 

346 

347 # Add a residue to the chain 

348 def add_residue(self, residue: 'Residue'): 

349 # Insert the new residue index in the list of residue indices keeping the order 

350 sorted_residue_index = bisect(self.residue_indices, residue.index) 

351 self.residue_indices.insert(sorted_residue_index, residue.index) 

352 # Update the residue internal chain index 

353 residue._chain_index = self.index 

354 

355 # Remove a residue from the chain 

356 # WARNING: Note that this function does not trigger the set_residue_indices 

357 def remove_residue(self, residue: 'Residue'): 

358 self.residue_indices.remove(residue.index) # This index MUST be in the list 

359 # If we removed the last index then this chain must be removed from its structure 

360 if len(self.residue_indices) == 0 and self.structure: 

361 self.structure.purge_chain(self) 

362 # Update the residue internal chain index 

363 residue._chain_index = None 

364 

365 # Atom indices for all atoms in the chain(read only) 

366 # In order to change atom indices they must be changed in their corresponding residues 

367 def get_atom_indices(self) -> List[int]: 

368 return sum([residue.atom_indices for residue in self.residues], []) 

369 atom_indices = property(get_atom_indices, None, None, "Atom indices for all atoms in the chain(read only)") 

370 

371 # Atoms in the chain(read only) 

372 # In order to change atoms they must be changed in their corresponding residues 

373 def get_atoms(self) -> List[int]: 

374 return sum([residue.atoms for residue in self.residues], []) 

375 atoms = property(get_atoms, None, None, "Atoms in the chain(read only)") 

376 

377 # Get the residues sequence in one-letter code 

378 def get_sequence(self) -> str: 

379 return ''.join([residue_name_2_letter(residue.name) for residue in self.residues]) 

380 

381 

382# A structure is a group of atoms organized in chains and residues 

383class Structure: 

384 def __init__(self, atoms: List['Atom'] = [], residues: List['Residue'] = [], chains: List['Chain'] = []): 

385 self.atoms = [] 

386 self.residues = [] 

387 self.chains = [] 

388 # Set references between instances 

389 for atom in atoms: 

390 self.set_new_atom(atom) 

391 for residue in residues: 

392 self.set_new_residue(residue) 

393 for chain in chains: 

394 self.set_new_chain(chain) 

395 

396 def __repr__(self): 

397 return '<Structure(' + str(len(self.atoms)) + ' atoms)>' 

398 

399 # Set a new atom in the structure 

400 def set_new_atom(self, atom: 'Atom'): 

401 atom._structure = self 

402 new_atom_index = len(self.atoms) 

403 self.atoms.append(atom) 

404 atom._index = new_atom_index 

405 

406 # Set a new residue in the structure 

407 # WARNING: Atoms must be set already before setting residues 

408 def set_new_residue(self, residue: 'Residue'): 

409 residue._structure = self 

410 new_residue_index = len(self.residues) 

411 self.residues.append(residue) 

412 residue._index = new_residue_index 

413 # In case the residue has atom indices, set relational indices on each atom 

414 for atom_index in residue.atom_indices: 

415 atom = self.atoms[atom_index] 

416 atom._residue_index = new_residue_index 

417 

418 # Set a new chain in the structure 

419 # WARNING: Residues and atoms must be set already before setting chains 

420 def set_new_chain(self, chain: 'Chain'): 

421 chain._structure = self 

422 new_chain_index = len(self.chains) 

423 self.chains.append(chain) 

424 chain._index = new_chain_index 

425 # In case the chain has residue indices, set relational indices on each residue 

426 for residue_index in chain.residue_indices: 

427 residue = self.residues[residue_index] 

428 residue._chain_index = new_chain_index 

429 

430 # Purge chain from the structure 

431 # This can be done only when the chain has no residues left in the structure 

432 # Renumerate all chain indices which have been offsetted as a result of the purge 

433 def purge_chain(self, chain: 'Chain'): 

434 # Check the chain can be purged 

435 if chain not in self.chains: 

436 raise ValueError('Chain ' + chain.name + ' is not in the structure already') 

437 if len(chain.residue_indices) > 0: 

438 raise ValueError('Chain ' + chain.name + ' is still having residues and thus it cannot be purged') 

439 # Get the current index of the chain to be purged 

440 purged_index = chain.index 

441 # Chains and their residues below this index are not to be modified 

442 # Chains and their residues over this index must be renumerated 

443 for affected_chain in self.chains[purged_index+1:]: 

444 # Chaging the index automatically changes all chain residues '_chain_index' values. 

445 affected_chain.index -= 1 

446 # Finally, remove the current chain from the list of chains in the structure 

447 self.chains.remove(chain) 

448 

449 # Set the structure from a pdb file 

450 @classmethod 

451 def from_pdb_file(cls, pdb_filename: str): 

452 if not os.path.exists(pdb_filename): 

453 raise SystemExit('File "' + pdb_filename + '" not found') 

454 # Read the pdb file line by line and set the parsed atoms, residues and chains 

455 parsed_atoms = [] 

456 parsed_residues = [] 

457 parsed_chains = [] 

458 atom_index = -1 

459 residue_index = -1 

460 with open(pdb_filename, 'r') as file: 

461 for line in file: 

462 # Parse atoms only 

463 start = line[0:6] 

464 is_atom = start == 'ATOM ' or start == 'HETATM' 

465 if not is_atom: 

466 continue 

467 # Mine all atom data 

468 atom_name = line[11:16].strip() 

469 residue_name = line[17:21].strip() 

470 chain = line[21:22] 

471 residue_number = int(line[22:26]) 

472 icode = line[26:27] 

473 if icode == ' ': 

474 icode = '' 

475 x_coord = float(line[30:38]) 

476 y_coord = float(line[38:46]) 

477 z_coord = float(line[46:54]) 

478 element = line[77:79].strip() 

479 # Set the parsed atom, residue and chain 

480 parsed_atom = Atom(name=atom_name, element=element, coords=(x_coord, y_coord, z_coord)) 

481 parsed_residue = Residue(name=residue_name, number=residue_number, icode=icode) 

482 parsed_chain = Chain(name=chain) 

483 # Add the parsed atom to the list and update the current atom index 

484 parsed_atoms.append(parsed_atom) 

485 atom_index += 1 

486 # Check if we are in the same chain/residue than before 

487 same_chain = parsed_chains and parsed_chains[-1] == parsed_chain 

488 same_residue = same_chain and parsed_residues and parsed_residue.same_inputs_as(parsed_residues[-1]) 

489 # Update the residue atom indices 

490 # If the residue equals the last parsed residue then use the previous instead 

491 if same_residue: 

492 parsed_residue = parsed_residues[-1] 

493 parsed_residue.atom_indices.append(atom_index) 

494 # If it is the same residue then it will be the same chain as well so we can proceed 

495 continue 

496 # Otherwise, include the new residue in the list and update the current residue index 

497 parsed_residues.append(parsed_residue) 

498 residue_index += 1 

499 parsed_residue.atom_indices.append(atom_index) 

500 # If the chain equals the last parsed chain then use the previous instead 

501 if same_chain: 

502 parsed_chain = parsed_chains[-1] 

503 parsed_chain.residue_indices.append(residue_index) 

504 continue 

505 # Otherwise, include the new chain in the list 

506 parsed_chains.append(parsed_chain) 

507 parsed_chain.residue_indices.append(residue_index) 

508 return cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains) 

509 

510 # Fix atom elements by gueesing them when missing 

511 # Set all elements with the first letter upper and the second(if any) lower 

512 def fix_atom_elements(self): 

513 for atom in self.atoms: 

514 # Make sure elements have the first letter cap and the second letter not cap 

515 if atom.element: 

516 atom.element = first_cap_only(atom.element) 

517 # If elements are missing then guess them from atom names 

518 else: 

519 atom.element = guess_name_element(atom.name) 

520 

521 # Generate a pdb file with current structure 

522 def generate_pdb_file(self, pdb_filename: str): 

523 with open(pdb_filename, "w") as file: 

524 file.write('REMARK mdtoolbelt generated pdb file\n') 

525 for a, atom in enumerate(self.atoms): 

526 residue = atom.residue 

527 index = str((a+1) % 100000).rjust(5) 

528 name = ' ' + atom.name.ljust(3) if len(atom.name) < 4 else atom.name 

529 residue_name = residue.name.ljust(4) 

530 chain = atom.chain.name.rjust(1) 

531 residue_number = str(residue.number).rjust(4) 

532 icode = residue.icode.rjust(1) 

533 coords = atom.coords 

534 x_coord, y_coord, z_coord = ["{:.3f}".format(coord).rjust(8) for coord in coords] 

535 occupancy = '1.00' # Just a placeholder 

536 temp_factor = '0.00' # Just a placeholder 

537 element = atom.element 

538 atom_line = ('ATOM ' + index + ' ' + name + ' ' + residue_name + chain + residue_number + icode + ' ' + x_coord + y_coord + z_coord + ' ' + occupancy + ' ' + temp_factor + ' ' + element).ljust(80) + '\n' 

539 file.write(atom_line) 

540 

541 # Get a chain by its name 

542 def get_chain_by_name(self, name: str) -> 'Chain': 

543 return next((c for c in self.chains if c.name == name), None) 

544 

545 # Get a summary of the structure 

546 def display_summary(self): 

547 print('Atoms: ' + str(len(self.atoms))) 

548 print('Residues: ' + str(len(self.residues))) 

549 print('Chains: ' + str(len(self.chains))) 

550 for chain in self.chains: 

551 print('Chain ' + chain.name + ' (' + str(len(chain.residue_indices)) + ' residues)') 

552 print(' -> ' + chain.get_sequence()) 

553 

554 # This is an alternative system to find protein chains(anything else is chained as 'X') 

555 # This system does not depend on VMD 

556 # It totally overrides previous chains since it is expected to be used only when chains are missing 

557 def raw_protein_chainer(self): 

558 current_chain = self.get_next_available_chain_name() 

559 previous_alpha_carbon = None 

560 for residue in self.residues: 

561 alpha_carbon = next((atom for atom in residue.atoms if atom.name == 'CA'), None) 

562 if not alpha_carbon: 

563 residue.set_chain('X') 

564 continue 

565 # Connected aminoacids have their alpha carbons at a distance of around 3.8 Ångstroms 

566 residues_are_connected = previous_alpha_carbon and calculate_distance(previous_alpha_carbon, alpha_carbon) < 4 

567 if not residues_are_connected: 

568 current_chain = self.get_next_available_chain_name() 

569 residue.set_chain(current_chain) 

570 previous_alpha_carbon = alpha_carbon 

571 

572 # Get the next available chain name 

573 # Find alphabetically the first letter which is not yet used as a chain name 

574 # If all letters in the alphabet are used already then return None 

575 def get_next_available_chain_name(self) -> Optional[str]: 

576 current_chain_names = [chain.name for chain in self.chains] 

577 return next((name for name in available_caps if name not in current_chain_names), None) 

578 

579 

580# Calculate the distance between two atoms 

581def calculate_distance(atom_1: Atom, atom_2: Atom) -> float: 

582 squared_distances_sum = 0 

583 for i in {'x': 0, 'y': 1, 'z': 2}.values(): 

584 squared_distances_sum += (atom_1.coords[i] - atom_2.coords[i])**2 

585 return math.sqrt(squared_distances_sum) 

586 

587 

588# Set all available chains according to pdb standards 

589available_caps = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 

590 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'] 

591 

592# Protein residues 

593aminoacids = { 

594 "ALA": "A", 

595 "ALAN": "A", 

596 "ALAC": "A", 

597 "ARG": "R", 

598 "ARGN": "R", 

599 "ARGC": "R", 

600 "ASN": "N", 

601 "ASNN": "N", 

602 "ASNC": "N", 

603 "ASP": "D", 

604 "ASPN": "D", 

605 "ASPC": "D", 

606 "CYS": "C", 

607 "CYSN": "C", 

608 "CYSC": "C", 

609 "CYH": "C", 

610 "CSH": "C", 

611 "CSS": "C", 

612 "CYX": "C", 

613 "CYP": "C", 

614 "GLN": "Q", 

615 "GLNN": "Q", 

616 "GLNC": "Q", 

617 "GLU": "E", 

618 "GLUN": "E", 

619 "GLUC": "E", 

620 "GLY": "G", 

621 "GLYN": "G", 

622 "GLYC": "G", 

623 "HIS": "H", 

624 "HISN": "H", 

625 "HISC": "H", 

626 "HID": "H", 

627 "HIE": "H", 

628 "HIP": "H", 

629 "HSD": "H", 

630 "HSE": "H", 

631 "ILE": "I", 

632 "ILEN": "I", 

633 "ILEC": "I", 

634 "ILU": "I", 

635 "LEU": "L", 

636 "LEUN": "L", 

637 "LEUC": "L", 

638 "LYS": "K", 

639 "LYSN": "K", 

640 "LYSC": "K", 

641 "MET": "M", 

642 "METN": "M", 

643 "METC": "M", 

644 "PHE": "F", 

645 "PHEN": "F", 

646 "PHEC": "F", 

647 "PRO": "P", 

648 "PRON": "P", 

649 "PROC": "P", 

650 "PRØ": "P", 

651 "PR0": "P", 

652 "PRZ": "P", 

653 "SER": "S", 

654 "SERN": "S", 

655 "SERC": "S", 

656 "THR": "T", 

657 "THRN": "T", 

658 "THRC": "R", 

659 "TRP": "W", 

660 "TRPN": "W", 

661 "TRPC": "W", 

662 "TRY": "W", 

663 "TYR": "Y", 

664 "TYRN": "Y", 

665 "TYRC": "Y", 

666 "VAL": "V", 

667 "VALN": "V", 

668 "VALC": "V" 

669} 

670 

671# Nucleic acid residues 

672nucleotides = { 

673 "A": "A", 

674 "A3": "A", 

675 "A5": "A", 

676 "C": "C", 

677 "C3": "C", 

678 "C5": "C", 

679 "T": "T", 

680 "T3": "T", 

681 "T5": "T", 

682 "G": "G", 

683 "G3": "G", 

684 "G5": "G", 

685 "U": "U", 

686 "U3": "U", 

687 "U5": "U", 

688 "DA": "A", 

689 "DT": "T", 

690 "DC": "C", 

691 "DG": "G", 

692} 

693 

694# All supported residues 

695supported_residues = {**aminoacids, **nucleotides} 

696 

697 

698# Transform a residue name to its equivalent single letter code 

699# If the residue name is not recognized then return "X" 

700# e.g. "ARG" -> "R", "WTF" -> "X" 

701# You can choose which residue types are targeted(e.g. aminoacids only) 

702# Options are: 'all', 'aminoacids' or 'nucleotides' 

703# All residue types are targeted by default 

704def residue_name_2_letter(residue_name: str, residue_types: str = "all") -> str: 

705 # Set the target residues 

706 if residue_types == "all": 

707 target_residues = supported_residues 

708 elif residue_types == "aminoacids": 

709 target_residues = aminoacids 

710 elif residue_types == "nucleotides": 

711 target_residues = nucleotides 

712 else: 

713 raise ValueError('Unrecognized residue types ' + str(residue_types)) 

714 # Now find the corresponding letter among the target residues 

715 ref = target_residues.get(residue_name, False) 

716 return ref if ref else "X" 

717 

718 

719# Map the structure aminoacids sequences against the standard reference sequences 

720# References are uniprot accession ids and they are optional 

721# For each reference, align the reference sequence with the topology sequence 

722# Chains which do not match any reference sequence will be blasted 

723# Note that an internet connection is required both to retireve the uniprot reference sequence and to do the blast 

724# NEVER FORGET: This system relies on the fact that topology chains are not repeated 

725def generate_map_online(structure: 'Structure', forced_references: List[str] = []) -> Optional[dict]: 

726 # Store all the references which are got through this process 

727 # Note that not all references may be used at the end 

728 references = {} 

729 # Get the structure chain sequences 

730 structure_sequences = get_chain_sequences(structure) 

731 # Find out which chains are protein 

732 protein_sequences = [] 

733 for structure_sequence in structure_sequences: 

734 sequence = structure_sequence['sequence'] 

735 if next((letter for letter in sequence if letter != 'X'), None): 

736 structure_sequence['match'] = {'ref': None, 'map': None, 'score': 0} 

737 protein_sequences.append(structure_sequence) 

738 # For each input forced reference, get the reference sequence 

739 reference_sequences = {} 

740 if forced_references: 

741 # Check forced_references is not a string 

742 if type(forced_references) == str: 

743 raise TypeError('Forced references should be a list and not a string a this point') 

744 for uniprot_id in forced_references: 

745 reference = get_uniprot_reference(uniprot_id) 

746 reference_sequences[reference['uniprot']] = reference['sequence'] 

747 # Save the current whole reference object for later 

748 references[reference['uniprot']] = reference 

749 

750 # Try to match all protein sequences with the available reference sequences 

751 # In case of match, objects in the 'protein_sequences' list are modified by adding the result 

752 # Finally, return True if all protein sequences were matched with the available reference sequences or False if not 

753 def match_sequences() -> bool: 

754 # Track each chain-reference alignment match and keep the score of successful alignments 

755 # Now for each structure sequence, align all reference sequences and keep the best alignment(if it meets the minimum) 

756 for structure_sequence in protein_sequences: 

757 for uniprot_id, reference_sequence in reference_sequences.items(): 

758 # Align the structure sequence with the reference sequence 

759 align_results = align(reference_sequence, structure_sequence['sequence']) 

760 if not align_results: 

761 continue 

762 # In case we have a valid alignment, check the alignment score is better than the current reference score(if any) 

763 sequence_map, align_score = align_results 

764 current_reference = structure_sequence['match'] 

765 if current_reference['score'] > align_score: 

766 continue 

767 reference = references[uniprot_id] 

768 # If the alignment is better then we impose the new reference 

769 structure_sequence['match'] = {'ref': reference, 'map': sequence_map, 'score': align_score} 

770 # Sum up the current matching 

771 print('Reference summary:') 

772 for structure_sequence in structure_sequences: 

773 name = structure_sequence['name'] 

774 match = structure_sequence.get('match', None) 

775 if not match: 

776 print(' ' + name + ' -> Not protein') 

777 continue 

778 reference = structure_sequence['match'].get('ref', None) 

779 if not reference: 

780 print(' ' + name + ' -> ¿?') 

781 continue 

782 uniprot_id = reference['uniprot'] 

783 print(' ' + name + ' -> ' + uniprot_id) 

784 # Finally, return True if all protein sequences were matched with the available reference sequences or False if not 

785 return all([structure_sequence['match']['ref'] for structure_sequence in protein_sequences]) 

786 # If we have every protein chain matched with a reference already then we stop here 

787 if match_sequences(): 

788 return format_topology_data(structure, protein_sequences) 

789 # If there are still any chain which is not matched with a reference then we need more references 

790 # To get them, we run a blast with each orphan chain sequence 

791 for structure_sequence in protein_sequences: 

792 # Skip already references chains 

793 if structure_sequence['match']['ref']: 

794 continue 

795 # Run the blast 

796 sequence = structure_sequence['sequence'] 

797 uniprot_id = blast(sequence) 

798 # Build a new reference from the resulting uniprot 

799 reference = get_uniprot_reference(uniprot_id) 

800 reference_sequences[reference['uniprot']] = reference['sequence'] 

801 # Save the current whole reference object for later 

802 references[reference['uniprot']] = reference 

803 # If we have every protein chain matched with a reference already then we stop here 

804 if match_sequences(): 

805 return format_topology_data(structure, protein_sequences) 

806 print('The BLAST failed to find a matching reference sequence for at least one protein sequence') 

807 return None 

808 

809 

810# Try to match all protein sequences with the available reference sequences 

811# In case of match, objects in the 'protein_sequences' list are modified by adding the result 

812# Finally, return True if all protein sequences were matched with the available reference sequences or False if not 

813def match_sequences(protein_sequences: list, reference_sequences: dict) -> bool: 

814 # Track each chain-reference alignment match and keep the score of successful alignments 

815 # Now for each structure sequence, align all reference sequences and keep the best alignment(if it meets the minimum) 

816 for structure_sequence in protein_sequences: 

817 for uniprot_id, reference_sequence in reference_sequences.items(): 

818 # Align the structure sequence with the reference sequence 

819 align_results = align(reference_sequence, structure_sequence['sequence']) 

820 if not align_results: 

821 continue 

822 # In case we have a valid alignment, check the alignment score is better than the current reference score(if any) 

823 sequence_map, align_score = align_results 

824 current_reference = structure_sequence['match'] 

825 if current_reference['score'] > align_score: 

826 continue 

827 reference = references[uniprot_id] 

828 

829 # If the alignment is better then we impose the new reference 

830 structure_sequence['match'] = {'ref': reference, 'map': sequence_map, 'score': align_score} 

831 # Finally, return True if all protein sequences were matched with the available reference sequences or False if not 

832 return all([structure_sequence['match']['ref'] for structure_sequence in protein_sequences]) 

833 

834 

835# Reformat mapping data to the topology system(introduced later) 

836def format_topology_data(structure: 'Structure', mapping_data: list) -> dict: 

837 # Get the count of residues from the structure 

838 residues_count = len(structure.residues) 

839 # Now format data 

840 reference_ids = [] 

841 residue_reference_indices = [None] * residues_count 

842 residue_reference_numbers = [None] * residues_count 

843 for data in mapping_data: 

844 match = data['match'] 

845 # Get the reference index 

846 # Note that several matches may belong to the same reference and thus have the same index 

847 reference = match['ref'] 

848 uniprot_id = reference['uniprot'] 

849 if uniprot_id not in reference_ids: 

850 reference_ids.append(reference['uniprot']) 

851 reference_index = reference_ids.index(uniprot_id) 

852 residue_indices = data['residue_indices'] 

853 for r, residue_number in enumerate(match['map']): 

854 if residue_number is None: 

855 continue 

856 residue_index = residue_indices[r] 

857 residue_reference_indices[residue_index] = reference_index 

858 residue_reference_numbers[residue_index] = residue_number 

859 # If there are not references at the end then set all fields as None, in order to save space 

860 if len(reference_ids) == 0: 

861 reference_ids = None 

862 residue_reference_indices = None 

863 residue_reference_numbers = None 

864 # Return the 3 topology fields as they are in the database 

865 residues_map = {'references': reference_ids, 'residue_reference_indices': residue_reference_indices, 'residue_reference_numbers': residue_reference_numbers} 

866 return residues_map 

867 

868 

869# Align a reference aminoacid sequence with each chain sequence in a topology 

870# NEVER FORGET: This system relies on the fact that topology chains are not repeated 

871def map_sequence(ref_sequence: str, structure: 'Structure') -> list: 

872 sequences = get_chain_sequences(structure) 

873 mapping = [] 

874 for s in sequences: 

875 sequence = sequences[s] 

876 sequence_map = align(ref_sequence, sequence) 

877 mapping += sequence_map 

878 return mapping 

879 

880 

881# Get each chain name and aminoacids sequence in a topology 

882# Output format example: [{'sequence': 'VNLTT', 'indices': [1, 2, 3, 4, 5]}, ...] 

883def get_chain_sequences(structure: 'Structure') -> list: 

884 sequences = [] 

885 chains = structure.chains 

886 for chain in chains: 

887 name = chain.name 

888 sequence = '' 

889 residue_indices = [] 

890 for residue in chain.residues: 

891 letter = residue_name_2_letter(residue.name, 'aminoacids') 

892 sequence += letter 

893 residue_indices.append(residue.index) 

894 # Save sequences by chain name (i.e. chain id or chain letter) 

895 sequence_object = {'name': name, 'sequence': sequence, 'residue_indices': residue_indices} 

896 sequences.append(sequence_object) 

897 return sequences 

898 

899 

900# Align two aminoacid sequences 

901# Return a list with the reference residue indexes (values) 

902# which match each new sequence residues indexes (indexes) 

903# Return also the score of the alignment 

904# Return None when there is not valid alignment at all 

905def align(ref_sequence: str, new_sequence: str) -> Optional[Tuple[list, float]]: 

906 

907 # print('- REFERENCE\n' + ref_sequence + '\n- NEW\n' + new_sequence) 

908 

909 # If the new sequence is all 'X' stop here, since this would make the alignment infinite 

910 # Then an array filled with None is returned 

911 if all([letter == 'X' for letter in new_sequence]): 

912 return None 

913 

914 # Return the new sequence as best aligned as possible with the reference sequence 

915 alignments = pairwise2.align.localds(ref_sequence, new_sequence, blosum62, -10, -0.5) 

916 # DANI: Habría que hacerlo de esta otra forma según el deprecation warning (arriba hay más código) 

917 # DANI: El problema es que el output lo tiene todo menos la sequencia en formato alienada 

918 # DANI: i.e. formato '----VNLTT', que es justo el que necesito 

919 # alignments = aligner.align(ref_sequence, new_sequence) 

920 

921 # In case there are no alignments it means the current chain has nothing to do with this reference 

922 # Then an array filled with None is returned 

923 if len(alignments) == 0: 

924 return None 

925 

926 # Several alignments may be returned, specially when it is a difficult or impossible alignment 

927 # Output format example: '----VNLTT' 

928 best_alignment = alignments[0] 

929 aligned_sequence = best_alignment[1] 

930 print(format_alignment(*alignments[0])) 

931 score = alignments[0][2] 

932 # WARNING: Do not use 'aligned_sequence' length here since it has the total sequence length 

933 normalized_score = score / len(new_sequence) 

934 print('Normalized score: ' + str(normalized_score)) 

935 

936 # If the normalized score does not reaches the minimum we consider the alignment is not valid 

937 # It may happen when the reference goes for a specific chain but we must map all chains 

938 # This 1 has been found experimentally 

939 # Non maching sequence may return a 0.1-0.3 normalized score 

940 # Matching sequence may return >4 normalized score 

941 if normalized_score < 1: 

942 print('Not valid alignment') 

943 return None 

944 

945 # Match each residue 

946 aligned_mapping = [] 

947 aligned_index = 0 

948 for l_index, letter in enumerate(aligned_sequence): 

949 # Guions are skipped 

950 if letter == '-': 

951 continue 

952 # Get the current residue of the new sequence 

953 equivalent_letter = new_sequence[aligned_index] 

954 if not letter == equivalent_letter: 

955 raise SystemExit('Something was wrong:S') 

956 # 'X' residues cannot be mapped since reference sequences should never have any 'X' 

957 if letter == 'X': 

958 aligned_mapping.append(None) 

959 aligned_index += 1 

960 continue 

961 # Otherwise add the equivalent aligned index to the mapping 

962 # WARNING: Add +1 since uniprot residue counts start at 1, not 0 

963 aligned_mapping.append(l_index + 1) 

964 aligned_index += 1 

965 

966 return aligned_mapping, normalized_score 

967 

968 

969# Given an aminoacids sequence, return a list of uniprot ids 

970# Note that we are blasting against UniProtKB / Swiss-Prot so results will always be valid UniProt accessions 

971# WARNING: This always means results will correspond to curated entries only 

972# If your sequence is from an exotic organism the result may be not from it but from other more studied organism 

973def blast(sequence: str) -> List[str]: 

974 print('Throwing blast... (this may take a few minutes)') 

975 result = NCBIWWW.qblast(program="blastp", database="swissprot", sequence=sequence) 

976 parsed_result = xmltodict.parse(result.read()) 

977 hits = parsed_result['BlastOutput']['BlastOutput_iterations']['Iteration']['Iteration_hits']['Hit'] 

978 # Get the first result only 

979 result = hits[0] 

980 # Return the accession 

981 # DANI: Si algun día tienes problemas porque te falta el '.1' al final del accession puedes sacarlo de Hit_id 

982 accession = result['Hit_accession'] 

983 print('Result: ' + accession) 

984 print(result['Hit_def']) 

985 return accession 

986 

987 

988# Given a uniprot accession, use the uniprot API to request its data and then mine what is needed for the database 

989def get_uniprot_reference(uniprot_accession: str) -> dict: 

990 # Request Uniprot 

991 request_url = 'https://www.ebi.ac.uk/proteins/api/proteins/' + uniprot_accession 

992 try: 

993 with urllib.request.urlopen(request_url) as response: 

994 parsed_response = json.loads(response.read().decode("utf-8")) 

995 # If the accession is not found in UniProt then the id is not valid 

996 except urllib.error.HTTPError as error: 

997 if error.code == 400: 

998 raise ValueError('Something went wrong with the Uniprot request: ' + request_url) 

999 # Get the aminoacids sequence 

1000 sequence = parsed_response['sequence']['sequence'] 

1001 return {'uniprot': uniprot_accession, 'sequence': sequence}