Coverage for biobb_model/model/fix_pdb_utils.py: 64%

530 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2025-01-28 11:32 +0000

1import json 

2import math 

3import os 

4import urllib.request 

5from bisect import bisect 

6from typing import Optional, Union 

7 

8import xmltodict # type: ignore 

9from Bio import pairwise2 # type: ignore 

10from Bio.Blast import NCBIWWW # type: ignore 

11from Bio.pairwise2 import format_alignment # type: ignore 

12 

13Coords = tuple[float, float, float] 

14 

15try: 

16 from Bio.SubsMat import MatrixInfo # type: ignore 

17 

18 blosum62 = MatrixInfo.blosum62 

19except ImportError: 

20 from Bio.Align import substitution_matrices # type: ignore 

21 

22 blosum62 = substitution_matrices.load("BLOSUM62") 

23 

24 

25# An atom 

26class Atom: 

27 def __init__( 

28 self, 

29 name: Optional[str] = None, 

30 element: Optional[str] = None, 

31 coords: Optional[Coords] = None, 

32 ): 

33 self.name = name 

34 self.element = element 

35 self.coords = coords 

36 # Set variables to store references to other related instances 

37 # These variables will be set further by the structure 

38 self._structure: Optional[Structure] = None 

39 self._index: Optional[int] = None 

40 self._residue_index: Optional[int] = None 

41 

42 def __repr__(self): 

43 return "<Atom " + str(self.name) + ">" 

44 

45 def __eq__(self, other): 

46 return self._residue_index == other._residue_index and self.name == other.name 

47 

48 # The parent structure (read only) 

49 # This value is set by the structure itself 

50 def get_structure(self): 

51 return self._structure 

52 

53 structure = property(get_structure, None, None, "The parent structure (read only)") 

54 

55 # The residue index according to parent structure residues (read only) 

56 # This value is set by the structure itself 

57 def get_index(self): 

58 return self._index 

59 

60 index = property( 

61 get_index, 

62 None, 

63 None, 

64 "The residue index according to parent structure residues (read only)", 

65 ) 

66 

67 # The atom residue index according to parent structure residues 

68 # If residue index is set then make changes in all the structure to make this change coherent 

69 def get_residue_index(self) -> Optional[int]: 

70 return self._residue_index 

71 

72 def set_residue_index(self, new_residue_index: int): 

73 # If there is not strucutre yet it means the residue is beeing set before the structure 

74 # We just save the residue index and wait for the structure to be set 

75 if not self.structure: 

76 self._residue_index = new_residue_index 

77 return 

78 # Relational indices are updated through a top-down hierarchy 

79 # Affected residues are the ones to update this atom internal residue index 

80 current_residue = self.residue 

81 current_residue.remove_atom(self) 

82 new_residue = self.structure.residues[new_residue_index] 

83 new_residue.add_atom(self) 

84 

85 residue_index = property( 

86 get_residue_index, 

87 set_residue_index, 

88 None, 

89 "The atom residue index according to parent structure residues", 

90 ) 

91 

92 # The atom residue 

93 # If residue is set then make changes in all the structure to make this change coherent 

94 def get_residue(self) -> Optional["Residue"]: 

95 # If there is not strucutre yet it means the atom is beeing set before the structure 

96 # In this case it is not possible to get related residues in the structure 

97 if not self.structure: 

98 return None 

99 # Get the residue in the structure according to the residue index 

100 return self.structure.residues[self.residue_index] 

101 

102 def set_residue(self, new_residue: "Residue"): 

103 # Find the new residue index and set it as the atom residue index 

104 # Note that the residue must be set in the structure already 

105 new_residue_index = new_residue.index 

106 if new_residue_index is None: 

107 raise ValueError( 

108 "Residue " + str(new_residue) + " is not set in the structure" 

109 ) 

110 self.set_residue_index(new_residue_index) 

111 

112 residue = property(get_residue, set_residue, None, "The atom residue") 

113 

114 # The atom chain index according to parent structure chains (read only) 

115 # In order to change the chain index it must be changed in the atom residue 

116 def get_chain_index(self) -> int: 

117 return self.residue.chain_index 

118 

119 chain_index = property( 

120 get_chain_index, 

121 None, 

122 None, 

123 "The atom chain index according to parent structure chains (read only)", 

124 ) 

125 

126 # The atom chain(read only) 

127 # In order to change the chain it must be changed in the atom residue 

128 def get_chain(self) -> "Chain": 

129 # Get the chain in the structure according to the chain index 

130 return self.structure.chains[self.chain_index] 

131 

132 chain = property(get_chain, None, None, "The atom chain (read only)") 

133 

134 

135# A residue 

136class Residue: 

137 def __init__( 

138 self, 

139 name: Optional[str] = None, 

140 number: Optional[int] = None, 

141 icode: Optional[str] = None, 

142 ): 

143 self.name = name 

144 self.number = number 

145 self.icode = icode 

146 # Set variables to store references to other related instaces 

147 # These variables will be set further by the structure 

148 self._structure: Optional[Structure] = None 

149 self._index: Optional[int] = None 

150 self._atom_indices: list[int] = [] 

151 self._chain_index: Optional[int] = None 

152 

153 def __repr__(self): 

154 return ( 

155 "<Residue " + str(self.name) + str(self.number) + (self.icode if self.icode else "") + ">" 

156 ) 

157 

158 def __eq__(self, other): 

159 if not isinstance(self, type(other)): 

160 return False 

161 return ( 

162 self._chain_index == other._chain_index and self.number == other.number and self.icode == other.icode 

163 ) 

164 

165 def __hash__(self): 

166 # WARNING: This is susceptible to duplicated residues 

167 return hash((self._chain_index, self.number, self.icode)) 

168 # WARNING: This is not susceptible to duplicated residues 

169 # return hash(tuple(self._atom_indices)) 

170 

171 def same_inputs_as(self, other) -> bool: 

172 return ( 

173 self.name == other.name and self.number == other.number and self.icode == other.icode 

174 ) 

175 

176 # The parent structure (read only) 

177 # This value is set by the structure itself 

178 def get_structure(self): 

179 return self._structure 

180 

181 structure = property(get_structure, None, None, "The parent structure (read only)") 

182 

183 # The residue index according to parent structure residues (read only) 

184 # This value is set by the structure itself 

185 def get_index(self): 

186 return self._index 

187 

188 index = property( 

189 get_index, 

190 None, 

191 None, 

192 "The residue index according to parent structure residues(read only)", 

193 ) 

194 

195 # The atom indices according to parent structure atoms for atoms in this residue 

196 # If atom indices are set then make changes in all the structure to make this change coherent 

197 def get_atom_indices(self) -> list[int]: 

198 return self._atom_indices 

199 

200 def set_atom_indices(self, new_atom_indices: list[int]): 

201 # If there is not strucutre yet it means the residue is beeing set before the structure 

202 # We just save atom indices and wait for the structure to be set 

203 if not self.structure: 

204 self._atom_indices = new_atom_indices 

205 return 

206 # Update the current atoms 

207 for atom in self.atoms: 

208 atom._residue_index = None 

209 # Update the new atoms 

210 for index in new_atom_indices: 

211 atom = self.structure.atoms[index] 

212 atom._residue_index = self.index 

213 # Now new indices are coherent and thus we can save them 

214 self._atom_indices = new_atom_indices 

215 

216 atom_indices = property( 

217 get_atom_indices, 

218 set_atom_indices, 

219 None, 

220 "The atom indices according to parent structure atoms for atoms in this residue", 

221 ) 

222 

223 # The atoms in this residue 

224 # If atoms are set then make changes in all the structure to make this change coherent 

225 def get_atoms(self) -> list["Atom"]: 

226 # If there is not strucutre yet it means the chain is beeing set before the structure 

227 # In this case it is not possible to get related atoms in the structure 

228 if not self.structure: 

229 return [] 

230 # Get atoms in the structure according to atom indices 

231 atoms = self.structure.atoms 

232 return [atoms[atom_index] for atom_index in self.atom_indices] 

233 

234 def set_atoms(self, new_atoms: list["Atom"]): 

235 # Find indices for new atoms and set their indices as the new atom indices 

236 # Note that atoms must be set in the structure already 

237 new_atom_indices = [] 

238 for new_atom in new_atoms: 

239 new_atom_index = new_atom.index 

240 if new_atom_index is None: 

241 raise ValueError( 

242 "Atom " + str(new_atom) + " is not set in the structure" 

243 ) 

244 new_atom_indices.append(new_atom_index) 

245 self.set_atom_indices(new_atom_indices) 

246 

247 atoms = property(get_atoms, set_atoms, None, "The atoms in this residue") 

248 

249 # Add an atom to the residue 

250 def add_atom(self, new_atom: "Atom"): 

251 # Insert the new atom index in the list of atom indices keeping the order 

252 new_atom_index = new_atom.index 

253 sorted_atom_index = bisect(self.atom_indices, new_atom_index) 

254 self.atom_indices.insert(sorted_atom_index, new_atom_index) 

255 # Update the atom internal index 

256 new_atom._residue_index = self.index 

257 

258 # Remove an atom from the residue 

259 def remove_atom(self, current_atom: "Atom"): 

260 # Remove the current atom index from the atom indices list 

261 self.atom_indices.remove(current_atom.index) # This index MUST be in the list 

262 # Update the atom internal index 

263 current_atom._residue_index = None 

264 

265 # The residue chain index according to parent structure chains 

266 # If chain index is set then make changes in all the structure to make this change coherent 

267 def get_chain_index(self) -> Optional[int]: 

268 return self._chain_index 

269 

270 def set_chain_index(self, new_chain_index: int): 

271 # If there is not strucutre yet it means the chain is beeing set before the structure 

272 # We just save the chain index and wait for the structure to be set 

273 if not self.structure: 

274 self._chain_index = new_chain_index 

275 return 

276 # Relational indices are updated through a top-down hierarchy 

277 # Affected chains are the ones to update this residue internal chain index 

278 # WARNING: It is critical to find both current and new chains before removing/adding residues 

279 # WARNING: It may happend that we remove the last residue in the current chain and the current chain is purged 

280 # WARNING: Thus the 'new_chain_index' would be obsolete since the structure.chains list would have changed 

281 current_chain = self.chain 

282 new_chain = self.structure.chains[new_chain_index] 

283 current_chain.remove_residue(self) 

284 new_chain.add_residue(self) 

285 

286 chain_index = property( 

287 get_chain_index, 

288 set_chain_index, 

289 None, 

290 "The residue chain index according to parent structure chains", 

291 ) 

292 

293 # The residue chain 

294 # If chain is set then make changes in all the structure to make this change coherent 

295 def get_chain(self) -> Union["Chain", list]: 

296 # If there is not strucutre yet it means the residue is beeing set before the structure 

297 # In this case it is not possible to get related chain in the structure 

298 if not self.structure: 

299 return [] 

300 # Get the chain in the structure according to the chain index 

301 return self.structure.chains[self.chain_index] 

302 

303 def set_chain(self, new_chain: Union["Chain", str]): 

304 # In case the chain is just a string we must find/create the corresponding chain 

305 if isinstance(new_chain, str): 

306 letter = new_chain 

307 # Get the residue structure 

308 structure = self.structure 

309 if not structure: 

310 raise ValueError( 

311 "Cannot find the corresponding " + new_chain + " chain without the structure" 

312 ) 

313 # Find if the letter belongs to an already existing chain 

314 new_chain = structure.get_chain_by_name(letter) 

315 # If the chain does not exist yet then create it 

316 if not new_chain: 

317 new_chain = Chain(name=letter) 

318 structure.set_new_chain(new_chain) 

319 # Find the new chain index and set it as the residue chain index 

320 # Note that the chain must be set in the structure already 

321 new_chain_index = new_chain.index 

322 if new_chain_index is None: 

323 raise ValueError("Chain " + str(new_chain) + " is not set in the structure") 

324 if isinstance(new_chain_index, int): 

325 self.set_chain_index(new_chain_index) 

326 

327 chain = property(get_chain, set_chain, None, "The residue chain") 

328 

329 

330# A chain 

331class Chain: 

332 def __init__(self, name: Optional[str] = None): 

333 self.name = name 

334 # Set variables to store references to other related instaces 

335 # These variables will be set further by the structure 

336 self._structure: Optional[Structure] = None 

337 self._index: Optional[int] = None 

338 self.residue_indices = [] 

339 

340 def __repr__(self): 

341 return "<Chain " + str(self.name) + ">" 

342 

343 def __eq__(self, other): 

344 return self.name == other.name 

345 

346 # The parent structure(read only) 

347 # This value is set by the structure itself 

348 def get_structure(self): 

349 return self._structure 

350 

351 structure = property(get_structure, None, None, "The parent structure(read only)") 

352 

353 # The residue index according to parent structure residues(read only) 

354 # This value is set by the structure itself 

355 def get_index(self): 

356 return self._index 

357 

358 # When the index is set all residues are updated with the nex chain index 

359 def set_index(self, index: int): 

360 for residue in self.residues: 

361 residue._chain_index = index 

362 self._index = index 

363 

364 index = property( 

365 get_index, 

366 set_index, 

367 None, 

368 "The residue index according to parent structure residues(read only)", 

369 ) 

370 

371 # The residue indices according to parent structure residues for residues in this chain 

372 # If residue indices are set then make changes in all the structure to make this change coherent 

373 def get_residue_indices(self) -> list[int]: 

374 return self._residue_indices 

375 

376 def set_residue_indices(self, new_residue_indices: list[int]): 

377 # If there is not strucutre yet it means the chain is beeing set before the structure 

378 # We just save residue indices and wait for the structure to be set 

379 if not self.structure: 

380 self._residue_indices = new_residue_indices 

381 return 

382 # Update the current residues 

383 for residue in self.residues: 

384 residue._chain_index = None 

385 # Update the new residues 

386 for index in new_residue_indices: 

387 residue = self.structure.residues[index] 

388 residue._chain_index = self.index 

389 # In case the new residue indices list is empty this chain must be removed from its structure 

390 if len(new_residue_indices) == 0: 

391 self.structure.purge_chain(self) 

392 # Now new indices are coherent and thus we can save them 

393 self._residue_indices = new_residue_indices 

394 

395 residue_indices = property( 

396 get_residue_indices, 

397 set_residue_indices, 

398 None, 

399 "The residue indices according to parent structure residues for residues in this residue", 

400 ) 

401 

402 # The residues in this chain 

403 # If residues are set then make changes in all the structure to make this change coherent 

404 def get_residues(self) -> list["Residue"]: 

405 # If there is not strucutre yet it means the chain is beeing set before the structure 

406 # In this case it is not possible to get related residues in the structure 

407 if not self.structure: 

408 return [] 

409 # Get residues in the structure according to residue indices 

410 residues = self.structure.residues 

411 return [residues[residue_index] for residue_index in self.residue_indices] 

412 

413 def set_residues(self, new_residues: list["Residue"]): 

414 # Find indices for new residues and set their indices as the new residue indices 

415 # Note that residues must be set in the structure already 

416 new_residue_indices = [] 

417 for new_residue in new_residues: 

418 new_residue_index = new_residue.index 

419 if new_residue_index is None: 

420 raise ValueError( 

421 "Residue " + str(new_residue) + " is not set in the structure" 

422 ) 

423 new_residue_indices.append(new_residue_index) 

424 self.set_residue_indices(new_residue_indices) 

425 

426 residues = property(get_residues, set_residues, None, "The residues in this chain") 

427 

428 # Add a residue to the chain 

429 def add_residue(self, residue: "Residue"): 

430 # Insert the new residue index in the list of residue indices keeping the order 

431 sorted_residue_index = bisect(self.residue_indices, residue.index) 

432 self.residue_indices.insert(sorted_residue_index, residue.index) 

433 # Update the residue internal chain index 

434 residue._chain_index = self.index 

435 

436 # Remove a residue from the chain 

437 # WARNING: Note that this function does not trigger the set_residue_indices 

438 def remove_residue(self, residue: "Residue"): 

439 self.residue_indices.remove(residue.index) # This index MUST be in the list 

440 # If we removed the last index then this chain must be removed from its structure 

441 if len(self.residue_indices) == 0 and self.structure: 

442 self.structure.purge_chain(self) 

443 # Update the residue internal chain index 

444 residue._chain_index = None 

445 

446 # Atom indices for all atoms in the chain(read only) 

447 # In order to change atom indices they must be changed in their corresponding residues 

448 def get_atom_indices(self) -> list[int]: 

449 return sum([residue.atom_indices for residue in self.residues], []) 

450 

451 atom_indices = property( 

452 get_atom_indices, 

453 None, 

454 None, 

455 "Atom indices for all atoms in the chain(read only)", 

456 ) 

457 

458 # Atoms in the chain(read only) 

459 # In order to change atoms they must be changed in their corresponding residues 

460 def get_atoms(self) -> list[int]: 

461 return sum([residue.atoms for residue in self.residues], []) 

462 

463 atoms = property(get_atoms, None, None, "Atoms in the chain(read only)") 

464 

465 # Get the residues sequence in one-letter code 

466 def get_sequence(self) -> str: 

467 return "".join( 

468 [residue_name_2_letter(residue.name) for residue in self.residues] 

469 ) 

470 

471 

472# A structure is a group of atoms organized in chains and residues 

473class Structure: 

474 def __init__( 

475 self, 

476 atoms: list["Atom"] = [], 

477 residues: list["Residue"] = [], 

478 chains: list["Chain"] = [], 

479 ): 

480 self.atoms: list[Atom] = [] 

481 self.residues: list[Residue] = [] 

482 self.chains: list[Chain] = [] 

483 # Set references between instances 

484 for atom in atoms: 

485 self.set_new_atom(atom) 

486 for residue in residues: 

487 self.set_new_residue(residue) 

488 for chain in chains: 

489 self.set_new_chain(chain) 

490 

491 def __repr__(self): 

492 return "<Structure(" + str(len(self.atoms)) + " atoms)>" 

493 

494 # Set a new atom in the structure 

495 def set_new_atom(self, atom: "Atom"): 

496 atom._structure = self 

497 new_atom_index = len(self.atoms) 

498 self.atoms.append(atom) 

499 atom._index = new_atom_index 

500 

501 # Set a new residue in the structure 

502 # WARNING: Atoms must be set already before setting residues 

503 def set_new_residue(self, residue: "Residue"): 

504 residue._structure = self 

505 new_residue_index = len(self.residues) 

506 self.residues.append(residue) 

507 residue._index = new_residue_index 

508 # In case the residue has atom indices, set relational indices on each atom 

509 for atom_index in residue.atom_indices: 

510 atom = self.atoms[atom_index] 

511 atom._residue_index = new_residue_index 

512 

513 # Set a new chain in the structure 

514 # WARNING: Residues and atoms must be set already before setting chains 

515 def set_new_chain(self, chain: "Chain"): 

516 chain._structure = self 

517 new_chain_index = len(self.chains) 

518 self.chains.append(chain) 

519 chain._index = new_chain_index 

520 # In case the chain has residue indices, set relational indices on each residue 

521 for residue_index in chain.residue_indices: 

522 residue = self.residues[residue_index] 

523 residue._chain_index = new_chain_index 

524 

525 # Purge chain from the structure 

526 # This can be done only when the chain has no residues left in the structure 

527 # Renumerate all chain indices which have been offsetted as a result of the purge 

528 def purge_chain(self, chain: "Chain"): 

529 # Check the chain can be purged 

530 if chain not in self.chains: 

531 raise ValueError( 

532 "Chain " + str(chain.name) + " is not in the structure already" 

533 ) 

534 if len(chain.residue_indices) > 0: 

535 raise ValueError( 

536 "Chain " + str(chain.name) + " is still having residues and thus it cannot be purged" 

537 ) 

538 # Get the current index of the chain to be purged 

539 purged_index = chain.index 

540 # Chains and their residues below this index are not to be modified 

541 # Chains and their residues over this index must be renumerated 

542 for affected_chain in self.chains[purged_index + 1:]: 

543 # Chaging the index automatically changes all chain residues '_chain_index' values. 

544 affected_chain.index -= 1 

545 # Finally, remove the current chain from the list of chains in the structure 

546 self.chains.remove(chain) 

547 

548 # Set the structure from a pdb file 

549 @classmethod 

550 def from_pdb_file(cls, pdb_filename: str): 

551 if not os.path.exists(pdb_filename): 

552 raise SystemExit('File "' + pdb_filename + '" not found') 

553 # Read the pdb file line by line and set the parsed atoms, residues and chains 

554 parsed_atoms = [] 

555 parsed_residues: list[Residue] = [] 

556 parsed_chains: list[Chain] = [] 

557 atom_index = -1 

558 residue_index = -1 

559 with open(pdb_filename, "r") as file: 

560 for line in file: 

561 # Parse atoms only 

562 start = line[0:6] 

563 is_atom = start == "ATOM " or start == "HETATM" 

564 if not is_atom: 

565 continue 

566 # Mine all atom data 

567 atom_name = line[11:16].strip() 

568 residue_name = line[17:21].strip() 

569 chain = line[21:22] 

570 residue_number = int(line[22:26]) 

571 icode = line[26:27] 

572 if icode == " ": 

573 icode = "" 

574 x_coord = float(line[30:38]) 

575 y_coord = float(line[38:46]) 

576 z_coord = float(line[46:54]) 

577 element = line[77:79].strip() 

578 # Set the parsed atom, residue and chain 

579 parsed_atom = Atom( 

580 name=atom_name, element=element, coords=(x_coord, y_coord, z_coord) 

581 ) 

582 parsed_residue = Residue( 

583 name=residue_name, number=residue_number, icode=icode 

584 ) 

585 parsed_chain = Chain(name=chain) 

586 # Add the parsed atom to the list and update the current atom index 

587 parsed_atoms.append(parsed_atom) 

588 atom_index += 1 

589 # Check if we are in the same chain/residue than before 

590 same_chain = parsed_chains and parsed_chains[-1] == parsed_chain 

591 same_residue = ( 

592 same_chain and parsed_residues and parsed_residue.same_inputs_as(parsed_residues[-1]) 

593 ) 

594 # Update the residue atom indices 

595 # If the residue equals the last parsed residue then use the previous instead 

596 if same_residue: 

597 parsed_residue = parsed_residues[-1] 

598 parsed_residue.atom_indices.append(atom_index) 

599 # If it is the same residue then it will be the same chain as well so we can proceed 

600 continue 

601 # Otherwise, include the new residue in the list and update the current residue index 

602 parsed_residues.append(parsed_residue) 

603 residue_index += 1 

604 parsed_residue.atom_indices.append(atom_index) 

605 # If the chain equals the last parsed chain then use the previous instead 

606 if same_chain: 

607 parsed_chain = parsed_chains[-1] 

608 parsed_chain.residue_indices.append(residue_index) 

609 continue 

610 # Otherwise, include the new chain in the list 

611 parsed_chains.append(parsed_chain) 

612 parsed_chain.residue_indices.append(residue_index) 

613 return cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains) 

614 

615 # Fix atom elements by gueesing them when missing 

616 # Set all elements with the first letter upper and the second(if any) lower 

617 # def fix_atom_elements(self): 

618 # for atom in self.atoms: 

619 # # Make sure elements have the first letter cap and the second letter not cap 

620 # if atom.element: 

621 # atom.element = first_cap_only(atom.element) 

622 # # If elements are missing then guess them from atom names 

623 # else: 

624 # atom.element = guess_name_element(atom.name) 

625 

626 # Generate a pdb file with current structure 

627 def generate_pdb_file(self, pdb_filename: str): 

628 with open(pdb_filename, "w") as file: 

629 file.write("REMARK mdtoolbelt generated pdb file\n") 

630 for a, atom in enumerate(self.atoms): 

631 residue = atom.residue 

632 index = str((a + 1) % 100000).rjust(5) 

633 name = ( 

634 " " + str(atom.name).ljust(3) 

635 if len(str(atom.name)) < 4 

636 else atom.name 

637 ) 

638 residue_name = residue.name.ljust(4) 

639 chain = atom.chain.name.rjust(1) 

640 residue_number = str(residue.number).rjust(4) 

641 icode = residue.icode.rjust(1) 

642 coords: Optional[Coords] = atom.coords 

643 if not coords: 

644 raise ValueError("Atom " + str(atom) + " has no coordinates") 

645 else: 

646 x_coord, y_coord, z_coord = [ 

647 "{:.3f}".format(coord).rjust(8) for coord in coords 

648 ] 

649 occupancy = "1.00" # Just a placeholder 

650 temp_factor = "0.00" # Just a placeholder 

651 element = atom.element 

652 atom_line = ( 

653 "ATOM " + str(index) + " " + str(name) + " " + residue_name + chain + residue_number + icode + " " + x_coord + y_coord + z_coord + " " + occupancy + " " + temp_factor + " " + element 

654 ).ljust(80) + "\n" 

655 file.write(atom_line) 

656 

657 # Get a chain by its name 

658 def get_chain_by_name(self, name: str) -> Optional[Chain]: 

659 return next((c for c in self.chains if c.name == name), None) 

660 

661 # Get a summary of the structure 

662 def display_summary(self): 

663 print("Atoms: " + str(len(self.atoms))) 

664 print("Residues: " + str(len(self.residues))) 

665 print("Chains: " + str(len(self.chains))) 

666 for chain in self.chains: 

667 print( 

668 "Chain " + str(chain.name) + " (" + str(len(chain.residue_indices)) + " residues)" 

669 ) 

670 print(" -> " + chain.get_sequence()) 

671 

672 # This is an alternative system to find protein chains(anything else is chained as 'X') 

673 # This system does not depend on VMD 

674 # It totally overrides previous chains since it is expected to be used only when chains are missing 

675 def raw_protein_chainer(self): 

676 current_chain = self.get_next_available_chain_name() 

677 previous_alpha_carbon = None 

678 for residue in self.residues: 

679 alpha_carbon = next( 

680 (atom for atom in residue.atoms if atom.name == "CA"), None 

681 ) 

682 if not alpha_carbon: 

683 residue.set_chain("X") 

684 continue 

685 # Connected aminoacids have their alpha carbons at a distance of around 3.8 Ångstroms 

686 residues_are_connected = ( 

687 previous_alpha_carbon and calculate_distance(previous_alpha_carbon, alpha_carbon) < 4 

688 ) 

689 if not residues_are_connected: 

690 current_chain = self.get_next_available_chain_name() 

691 if current_chain: 

692 residue.set_chain(current_chain) 

693 previous_alpha_carbon = alpha_carbon 

694 

695 # Get the next available chain name 

696 # Find alphabetically the first letter which is not yet used as a chain name 

697 # If all letters in the alphabet are used already then return None 

698 def get_next_available_chain_name(self) -> Optional[str]: 

699 current_chain_names = [chain.name for chain in self.chains] 

700 return next( 

701 (name for name in available_caps if name not in current_chain_names), None 

702 ) 

703 

704 

705# Calculate the distance between two atoms 

706def calculate_distance(atom_1: Atom, atom_2: Atom) -> float: 

707 squared_distances_sum = 0.0 

708 if atom_1.coords is not None and atom_2.coords is not None: 

709 for i in {"x": 0, "y": 1, "z": 2}.values(): 

710 squared_distances_sum += (atom_1.coords[i] - atom_2.coords[i]) ** 2 

711 return math.sqrt(squared_distances_sum) 

712 

713 

714# Set all available chains according to pdb standards 

715available_caps = [ 

716 "A", 

717 "B", 

718 "C", 

719 "D", 

720 "E", 

721 "F", 

722 "G", 

723 "H", 

724 "I", 

725 "J", 

726 "K", 

727 "L", 

728 "M", 

729 "N", 

730 "O", 

731 "P", 

732 "Q", 

733 "R", 

734 "S", 

735 "T", 

736 "U", 

737 "V", 

738 "W", 

739 "X", 

740 "Y", 

741 "Z", 

742] 

743 

744# Protein residues 

745aminoacids = { 

746 "ALA": "A", 

747 "ALAN": "A", 

748 "ALAC": "A", 

749 "ARG": "R", 

750 "ARGN": "R", 

751 "ARGC": "R", 

752 "ASN": "N", 

753 "ASNN": "N", 

754 "ASNC": "N", 

755 "ASP": "D", 

756 "ASPN": "D", 

757 "ASPC": "D", 

758 "CYS": "C", 

759 "CYSN": "C", 

760 "CYSC": "C", 

761 "CYH": "C", 

762 "CSH": "C", 

763 "CSS": "C", 

764 "CYX": "C", 

765 "CYP": "C", 

766 "GLN": "Q", 

767 "GLNN": "Q", 

768 "GLNC": "Q", 

769 "GLU": "E", 

770 "GLUN": "E", 

771 "GLUC": "E", 

772 "GLY": "G", 

773 "GLYN": "G", 

774 "GLYC": "G", 

775 "HIS": "H", 

776 "HISN": "H", 

777 "HISC": "H", 

778 "HID": "H", 

779 "HIE": "H", 

780 "HIP": "H", 

781 "HSD": "H", 

782 "HSE": "H", 

783 "ILE": "I", 

784 "ILEN": "I", 

785 "ILEC": "I", 

786 "ILU": "I", 

787 "LEU": "L", 

788 "LEUN": "L", 

789 "LEUC": "L", 

790 "LYS": "K", 

791 "LYSN": "K", 

792 "LYSC": "K", 

793 "MET": "M", 

794 "METN": "M", 

795 "METC": "M", 

796 "PHE": "F", 

797 "PHEN": "F", 

798 "PHEC": "F", 

799 "PRO": "P", 

800 "PRON": "P", 

801 "PROC": "P", 

802 "PRØ": "P", 

803 "PR0": "P", 

804 "PRZ": "P", 

805 "SER": "S", 

806 "SERN": "S", 

807 "SERC": "S", 

808 "THR": "T", 

809 "THRN": "T", 

810 "THRC": "R", 

811 "TRP": "W", 

812 "TRPN": "W", 

813 "TRPC": "W", 

814 "TRY": "W", 

815 "TYR": "Y", 

816 "TYRN": "Y", 

817 "TYRC": "Y", 

818 "VAL": "V", 

819 "VALN": "V", 

820 "VALC": "V", 

821} 

822 

823# Nucleic acid residues 

824nucleotides = { 

825 "A": "A", 

826 "A3": "A", 

827 "A5": "A", 

828 "C": "C", 

829 "C3": "C", 

830 "C5": "C", 

831 "T": "T", 

832 "T3": "T", 

833 "T5": "T", 

834 "G": "G", 

835 "G3": "G", 

836 "G5": "G", 

837 "U": "U", 

838 "U3": "U", 

839 "U5": "U", 

840 "DA": "A", 

841 "DT": "T", 

842 "DC": "C", 

843 "DG": "G", 

844} 

845 

846# All supported residues 

847supported_residues = {**aminoacids, **nucleotides} 

848 

849 

850# Transform a residue name to its equivalent single letter code 

851# If the residue name is not recognized then return "X" 

852# e.g. "ARG" -> "R", "WTF" -> "X" 

853# You can choose which residue types are targeted(e.g. aminoacids only) 

854# Options are: 'all', 'aminoacids' or 'nucleotides' 

855# All residue types are targeted by default 

856def residue_name_2_letter(residue_name: str, residue_types: str = "all") -> str: 

857 # Set the target residues 

858 if residue_types == "all": 

859 target_residues = supported_residues 

860 elif residue_types == "aminoacids": 

861 target_residues = aminoacids 

862 elif residue_types == "nucleotides": 

863 target_residues = nucleotides 

864 else: 

865 raise ValueError("Unrecognized residue types " + str(residue_types)) 

866 # Now find the corresponding letter among the target residues 

867 ref = target_residues.get(residue_name, "X") 

868 return ref 

869 

870 

871# Map the structure aminoacids sequences against the standard reference sequences 

872# References are uniprot accession ids and they are optional 

873# For each reference, align the reference sequence with the topology sequence 

874# Chains which do not match any reference sequence will be blasted 

875# Note that an internet connection is required both to retireve the uniprot reference sequence and to do the blast 

876# NEVER FORGET: This system relies on the fact that topology chains are not repeated 

877def generate_map_online( 

878 structure: "Structure", forced_references: list[str] = [] 

879) -> Optional[dict]: 

880 # Store all the references which are got through this process 

881 # Note that not all references may be used at the end 

882 references = {} 

883 # Get the structure chain sequences 

884 structure_sequences = get_chain_sequences(structure) 

885 # Find out which chains are protein 

886 protein_sequences = [] 

887 for structure_sequence in structure_sequences: 

888 sequence = structure_sequence["sequence"] 

889 if next((letter for letter in sequence if letter != "X"), None): 

890 structure_sequence["match"] = {"ref": None, "map": None, "score": 0} 

891 protein_sequences.append(structure_sequence) 

892 # For each input forced reference, get the reference sequence 

893 reference_sequences = {} 

894 if forced_references: 

895 # Check forced_references is not a string 

896 if isinstance(forced_references, str): 

897 raise TypeError( 

898 "Forced references should be a list and not a string a this point" 

899 ) 

900 for uniprot_id in forced_references: 

901 reference = get_uniprot_reference(uniprot_id) 

902 reference_sequences[reference["uniprot"]] = reference["sequence"] 

903 # Save the current whole reference object for later 

904 references[reference["uniprot"]] = reference 

905 

906 # Try to match all protein sequences with the available reference sequences 

907 # In case of match, objects in the 'protein_sequences' list are modified by adding the result 

908 # Finally, return True if all protein sequences were matched with the available reference sequences or False if not 

909 def match_sequences() -> bool: 

910 # Track each chain-reference alignment match and keep the score of successful alignments 

911 # Now for each structure sequence, align all reference sequences and keep the best alignment(if it meets the minimum) 

912 for structure_sequence in protein_sequences: 

913 for uniprot_id, reference_sequence in reference_sequences.items(): 

914 # Align the structure sequence with the reference sequence 

915 align_results = align( 

916 reference_sequence, structure_sequence["sequence"] 

917 ) 

918 if not align_results: 

919 continue 

920 # In case we have a valid alignment, check the alignment score is better than the current reference score(if any) 

921 sequence_map, align_score = align_results 

922 current_reference = structure_sequence["match"] 

923 if current_reference["score"] > align_score: 

924 continue 

925 reference = references[uniprot_id] 

926 # If the alignment is better then we impose the new reference 

927 structure_sequence["match"] = { 

928 "ref": reference, 

929 "map": sequence_map, 

930 "score": align_score, 

931 } 

932 # Sum up the current matching 

933 print("Reference summary:") 

934 for structure_sequence in structure_sequences: 

935 name = structure_sequence["name"] 

936 match = structure_sequence.get("match", None) 

937 if not match: 

938 print(" " + name + " -> Not protein") 

939 continue 

940 reference = structure_sequence["match"].get("ref", None) 

941 if not reference: 

942 print(" " + name + " -> ¿?") 

943 continue 

944 uniprot_id = reference["uniprot"] 

945 print(" " + name + " -> " + uniprot_id) 

946 # Finally, return True if all protein sequences were matched with the available reference sequences or False if not 

947 return all( 

948 [ 

949 structure_sequence["match"]["ref"] 

950 for structure_sequence in protein_sequences 

951 ] 

952 ) 

953 

954 # If we have every protein chain matched with a reference already then we stop here 

955 if match_sequences(): 

956 return format_topology_data(structure, protein_sequences) 

957 # If there are still any chain which is not matched with a reference then we need more references 

958 # To get them, we run a blast with each orphan chain sequence 

959 for structure_sequence in protein_sequences: 

960 # Skip already references chains 

961 if structure_sequence["match"]["ref"]: 

962 continue 

963 # Run the blast 

964 sequence = structure_sequence["sequence"] 

965 # uniprot_id = blast(sequence)[0] 

966 uniprot_id = blast(sequence) # type: ignore 

967 # Build a new reference from the resulting uniprot 

968 reference = get_uniprot_reference(uniprot_id) # type: ignore 

969 reference_sequences[reference["uniprot"]] = reference["sequence"] 

970 # Save the current whole reference object for later 

971 references[reference["uniprot"]] = reference 

972 # If we have every protein chain matched with a reference already then we stop here 

973 if match_sequences(): 

974 return format_topology_data(structure, protein_sequences) 

975 print( 

976 "The BLAST failed to find a matching reference sequence for at least one protein sequence" 

977 ) 

978 return None 

979 

980 

981# Try to match all protein sequences with the available reference sequences 

982# In case of match, objects in the 'protein_sequences' list are modified by adding the result 

983# Finally, return True if all protein sequences were matched with the available reference sequences or False if not 

984# def match_sequences(protein_sequences: list, reference_sequences: dict) -> bool: 

985# # Track each chain-reference alignment match and keep the score of successful alignments 

986# # Now for each structure sequence, align all reference sequences and keep the best alignment(if it meets the minimum) 

987# for structure_sequence in protein_sequences: 

988# for uniprot_id, reference_sequence in reference_sequences.items(): 

989# # Align the structure sequence with the reference sequence 

990# align_results = align(reference_sequence, structure_sequence['sequence']) 

991# if not align_results: 

992# continue 

993# # In case we have a valid alignment, check the alignment score is better than the current reference score(if any) 

994# sequence_map, align_score = align_results 

995# current_reference = structure_sequence['match'] 

996# if current_reference['score'] > align_score: 

997# continue 

998# reference = references[uniprot_id] # type: ignore 

999 

1000# # If the alignment is better then we impose the new reference 

1001# structure_sequence['match'] = {'ref': reference, 'map': sequence_map, 'score': align_score} 

1002# # Finally, return True if all protein sequences were matched with the available reference sequences or False if not 

1003# return all([structure_sequence['match']['ref'] for structure_sequence in protein_sequences]) 

1004 

1005 

1006# Reformat mapping data to the topology system(introduced later) 

1007def format_topology_data(structure: "Structure", mapping_data: list) -> dict: 

1008 # Get the count of residues from the structure 

1009 residues_count = len(structure.residues) 

1010 # Now format data 

1011 reference_ids = [] 

1012 residue_reference_indices: list = [None] * residues_count 

1013 residue_reference_numbers = [None] * residues_count 

1014 for data in mapping_data: 

1015 match = data["match"] 

1016 # Get the reference index 

1017 # Note that several matches may belong to the same reference and thus have the same index 

1018 reference = match["ref"] 

1019 uniprot_id = reference["uniprot"] 

1020 if uniprot_id not in reference_ids: 

1021 reference_ids.append(reference["uniprot"]) 

1022 reference_index = reference_ids.index(uniprot_id) 

1023 residue_indices = data["residue_indices"] 

1024 for r, residue_number in enumerate(match["map"]): 

1025 if residue_number is None: 

1026 continue 

1027 residue_index = residue_indices[r] 

1028 residue_reference_indices[residue_index] = reference_index 

1029 residue_reference_numbers[residue_index] = residue_number 

1030 # If there are not references at the end then set all fields as None, in order to save space 

1031 if len(reference_ids) == 0: 

1032 reference_ids = [] 

1033 residue_reference_indices = [] 

1034 residue_reference_numbers = [] 

1035 # Return the 3 topology fields as they are in the database 

1036 residues_map = { 

1037 "references": reference_ids, 

1038 "residue_reference_indices": residue_reference_indices, 

1039 "residue_reference_numbers": residue_reference_numbers, 

1040 } 

1041 return residues_map 

1042 

1043 

1044# Align a reference aminoacid sequence with each chain sequence in a topology 

1045# NEVER FORGET: This system relies on the fact that topology chains are not repeated 

1046def map_sequence(ref_sequence: str, structure: "Structure") -> list: 

1047 sequences = get_chain_sequences(structure) 

1048 mapping: list = [] 

1049 for s in sequences: 

1050 sequence = sequences[s] 

1051 sequence_map = align(ref_sequence, sequence) 

1052 if sequence_map: 

1053 mapping += sequence_map 

1054 return mapping 

1055 

1056 

1057# Get each chain name and aminoacids sequence in a topology 

1058# Output format example: [{'sequence': 'VNLTT', 'indices': [1, 2, 3, 4, 5]}, ...] 

1059def get_chain_sequences(structure: "Structure") -> list: 

1060 sequences = [] 

1061 chains = structure.chains 

1062 for chain in chains: 

1063 name = chain.name 

1064 sequence = "" 

1065 residue_indices = [] 

1066 for residue in chain.residues: 

1067 letter = residue_name_2_letter(residue.name, "aminoacids") 

1068 sequence += letter 

1069 residue_indices.append(residue.index) 

1070 # Save sequences by chain name (i.e. chain id or chain letter) 

1071 sequence_object = { 

1072 "name": name, 

1073 "sequence": sequence, 

1074 "residue_indices": residue_indices, 

1075 } 

1076 sequences.append(sequence_object) 

1077 return sequences 

1078 

1079 

1080# Align two aminoacid sequences 

1081# Return a list with the reference residue indexes (values) 

1082# which match each new sequence residues indexes (indexes) 

1083# Return also the score of the alignment 

1084# Return None when there is not valid alignment at all 

1085def align(ref_sequence: str, new_sequence: str) -> Optional[tuple[list, float]]: 

1086 # print('- REFERENCE\n' + ref_sequence + '\n- NEW\n' + new_sequence) 

1087 

1088 # If the new sequence is all 'X' stop here, since this would make the alignment infinite 

1089 # Then an array filled with None is returned 

1090 if all([letter == "X" for letter in new_sequence]): 

1091 return None 

1092 

1093 # Return the new sequence as best aligned as possible with the reference sequence 

1094 alignments = pairwise2.align.localds( # type: ignore 

1095 ref_sequence, new_sequence, blosum62, -10, -0.5 

1096 ) 

1097 # DANI: Habría que hacerlo de esta otra forma según el deprecation warning (arriba hay más código) 

1098 # DANI: El problema es que el output lo tiene todo menos la sequencia en formato alienada 

1099 # DANI: i.e. formato '----VNLTT', que es justo el que necesito 

1100 # alignments = aligner.align(ref_sequence, new_sequence) 

1101 

1102 # In case there are no alignments it means the current chain has nothing to do with this reference 

1103 # Then an array filled with None is returned 

1104 if len(alignments) == 0: 

1105 return None 

1106 

1107 # Several alignments may be returned, specially when it is a difficult or impossible alignment 

1108 # Output format example: '----VNLTT' 

1109 best_alignment = alignments[0] 

1110 aligned_sequence = best_alignment[1] 

1111 print(format_alignment(*alignments[0])) 

1112 score = alignments[0][2] 

1113 # WARNING: Do not use 'aligned_sequence' length here since it has the total sequence length 

1114 normalized_score = score / len(new_sequence) 

1115 print("Normalized score: " + str(normalized_score)) 

1116 

1117 # If the normalized score does not reaches the minimum we consider the alignment is not valid 

1118 # It may happen when the reference goes for a specific chain but we must map all chains 

1119 # This 1 has been found experimentally 

1120 # Non maching sequence may return a 0.1-0.3 normalized score 

1121 # Matching sequence may return >4 normalized score 

1122 if normalized_score < 1: 

1123 print("Not valid alignment") 

1124 return None 

1125 

1126 # Match each residue 

1127 aligned_mapping: list = [] 

1128 aligned_index = 0 

1129 for l_index, letter in enumerate(aligned_sequence): 

1130 # Guions are skipped 

1131 if letter == "-": 

1132 continue 

1133 # Get the current residue of the new sequence 

1134 equivalent_letter = new_sequence[aligned_index] 

1135 if not letter == equivalent_letter: 

1136 raise SystemExit("Something was wrong:S") 

1137 # 'X' residues cannot be mapped since reference sequences should never have any 'X' 

1138 if letter == "X": 

1139 aligned_mapping.append(None) 

1140 aligned_index += 1 

1141 continue 

1142 # Otherwise add the equivalent aligned index to the mapping 

1143 # WARNING: Add +1 since uniprot residue counts start at 1, not 0 

1144 aligned_mapping.append(l_index + 1) 

1145 aligned_index += 1 

1146 

1147 return aligned_mapping, normalized_score 

1148 

1149 

1150# Given an aminoacids sequence, return a list of uniprot ids 

1151# Note that we are blasting against UniProtKB / Swiss-Prot so results will always be valid UniProt accessions 

1152# WARNING: This always means results will correspond to curated entries only 

1153# If your sequence is from an exotic organism the result may be not from it but from other more studied organism 

1154def blast(sequence: str) -> list[str]: 

1155 print("Throwing blast... (this may take a few minutes)") 

1156 result = NCBIWWW.qblast(program="blastp", database="swissprot", sequence=sequence) 

1157 parsed_result = xmltodict.parse(result.read()) 

1158 hits = parsed_result["BlastOutput"]["BlastOutput_iterations"]["Iteration"][ 

1159 "Iteration_hits" 

1160 ]["Hit"] 

1161 # Get the first result only 

1162 result = hits[0] 

1163 # Return the accession 

1164 # DANI: Si algun día tienes problemas porque te falta el '.1' al final del accession puedes sacarlo de Hit_id 

1165 accession = result["Hit_accession"] 

1166 print("Result: " + accession) 

1167 print(result["Hit_def"]) 

1168 return accession 

1169 

1170 

1171# Given a uniprot accession, use the uniprot API to request its data and then mine what is needed for the database 

1172def get_uniprot_reference(uniprot_accession: str) -> dict: 

1173 # Request Uniprot 

1174 request_url = "https://www.ebi.ac.uk/proteins/api/proteins/" + uniprot_accession 

1175 try: 

1176 with urllib.request.urlopen(request_url) as response: 

1177 parsed_response = json.loads(response.read().decode("utf-8")) 

1178 # If the accession is not found in UniProt then the id is not valid 

1179 except urllib.error.HTTPError as error: # type: ignore 

1180 if error.code == 400: 

1181 raise ValueError( 

1182 "Something went wrong with the Uniprot request: " + request_url 

1183 ) 

1184 # Get the aminoacids sequence 

1185 sequence = parsed_response["sequence"]["sequence"] 

1186 return {"uniprot": uniprot_accession, "sequence": sequence}