Coverage for biobb_model/model/fix_pdb

1import os

2import urllib.request

3import json

4import math

5from bisect import bisect

6from Bio import pairwise2

7from Bio.pairwise2 import format_alignment

8from Bio.Blast import NCBIWWW

9import xmltodict

10from typing import List, Tuple, Optional, Union

11Coords = Tuple[float, float, float]

13try:

14 from Bio.SubsMat import MatrixInfo

15 blosum62 = MatrixInfo.blosum62

16except ImportError:

17 from Bio.Align import substitution_matrices

18 blosum62 = substitution_matrices.load("BLOSUM62")

21# An atom

22class Atom:

23 def __init__(self, name: Optional[str] = None, element: Optional[str] = None, coords: Optional[Coords] = None):

24 self.name = name

25 self.element = element

26 self.coords = coords

27 # Set variables to store references to other related instances

28 # These variables will be set further by the structure

29 self._structure = None

30 self._index = None

31 self._residue_index = None

33 def __repr__(self):

34 return '<Atom ' + self.name + '>'

36 def __eq__(self, other):

37 return self._residue_index == other._residue_index and self.name == other.name

39 # The parent structure (read only)

40 # This value is set by the structure itself

41 def get_structure(self):

42 return self._structure

43 structure = property(get_structure, None, None, "The parent structure (read only)")

45 # The residue index according to parent structure residues (read only)

46 # This value is set by the structure itself

47 def get_index(self):

48 return self._index

49 index = property(get_index, None, None, "The residue index according to parent structure residues (read only)")

51 # The atom residue index according to parent structure residues

52 # If residue index is set then make changes in all the structure to make this change coherent

53 def get_residue_index(self) -> int:

54 return self._residue_index

56 def set_residue_index(self, new_residue_index: int):

57 # If there is not strucutre yet it means the residue is beeing set before the structure

58 # We just save the residue index and wait for the structure to be set

59 if not self.structure:

60 self._residue_index = new_residue_index

61 return

62 # Relational indices are updated through a top-down hierarchy

63 # Affected residues are the ones to update this atom internal residue index

64 current_residue = self.residue

65 current_residue.remove_atom(self)

66 new_residue = self.structure.residues[new_residue_index]

67 new_residue.add_atom(self)

68 residue_index = property(get_residue_index, set_residue_index, None, "The atom residue index according to parent structure residues")

70 # The atom residue

71 # If residue is set then make changes in all the structure to make this change coherent

72 def get_residue(self) -> 'Residue':

73 # If there is not strucutre yet it means the atom is beeing set before the structure

74 # In this case it is not possible to get related residues in the structure

75 if not self.structure:

76 return None

77 # Get the residue in the structure according to the residue index

78 return self.structure.residues[self.residue_index]

80 def set_residue(self, new_residue: 'Residue'):

81 # Find the new residue index and set it as the atom residue index

82 # Note that the residue must be set in the structure already

83 new_residue_index = new_residue.index

84 if new_residue_index is None:

85 raise ValueError('Residue ' + str(new_residue) + ' is not set in the structure')

86 self.set_residue_index(new_residue_index)

87 residue = property(get_residue, set_residue, None, "The atom residue")

89 # The atom chain index according to parent structure chains (read only)

90 # In order to change the chain index it must be changed in the atom residue

91 def get_chain_index(self) -> int:

92 return self.residue.chain_index

93 chain_index = property(get_chain_index, None, None, "The atom chain index according to parent structure chains (read only)")

95 # The atom chain(read only)

96 # In order to change the chain it must be changed in the atom residue

97 def get_chain(self) -> 'Chain':

98 # Get the chain in the structure according to the chain index

99 return self.structure.chains[self.chain_index]

100 chain = property(get_chain, None, None, "The atom chain (read only)")

101

102

103# A residue

104class Residue:

105 def __init__(self, name: Optional[str] = None, number: Optional[int] = None, icode: Optional[str] = None):

106 self.name = name

107 self.number = number

108 self.icode = icode

109 # Set variables to store references to other related instaces

110 # These variables will be set further by the structure

111 self._structure = None

112 self._index = None

113 self._atom_indices = []

114 self._chain_index = None

115

116 def __repr__(self):

117 return '<Residue ' + self.name + str(self.number) + (self.icode if self.icode else '') + '>'

118

119 def __eq__(self, other):

120 if type(self) != type(other):

121 return False

122 return (self._chain_index == other._chain_index and self.number == other.number and self.icode == other.icode)

123

124 def __hash__(self):

125 # WARNING: This is susceptible to duplicated residues

126 return hash((self._chain_index, self.number, self.icode))

127 # WARNING: This is not susceptible to duplicated residues

128 # return hash(tuple(self._atom_indices))

129

130 def same_inputs_as(self, other) -> bool:

131 return (

132 self.name == other.name and

133 self.number == other.number and

134 self.icode == other.icode

135 )

136

137 # The parent structure (read only)

138 # This value is set by the structure itself

139 def get_structure(self):

140 return self._structure

141 structure = property(get_structure, None, None, "The parent structure (read only)")

142

143 # The residue index according to parent structure residues (read only)

144 # This value is set by the structure itself

145 def get_index(self):

146 return self._index

147 index = property(get_index, None, None, "The residue index according to parent structure residues(read only)")

148

149 # The atom indices according to parent structure atoms for atoms in this residue

150 # If atom indices are set then make changes in all the structure to make this change coherent

151 def get_atom_indices(self) -> List[int]:

152 return self._atom_indices

153

154 def set_atom_indices(self, new_atom_indices: List[int]):

155 # If there is not strucutre yet it means the residue is beeing set before the structure

156 # We just save atom indices and wait for the structure to be set

157 if not self.structure:

158 self._atom_indices = new_atom_indices

159 return

160 # Update the current atoms

161 for atom in self.atoms:

162 atom._residue_index = None

163 # Update the new atoms

164 for index in new_atom_indices:

165 atom = self.structure.atoms[index]

166 atom._residue_index = self.index

167 # Now new indices are coherent and thus we can save them

168 self._atom_indices = new_atom_indices

169 atom_indices = property(get_atom_indices, set_atom_indices, None, "The atom indices according to parent structure atoms for atoms in this residue")

170

171 # The atoms in this residue

172 # If atoms are set then make changes in all the structure to make this change coherent

173 def get_atoms(self) -> List['Atom']:

174 # If there is not strucutre yet it means the chain is beeing set before the structure

175 # In this case it is not possible to get related atoms in the structure

176 if not self.structure:

177 return []

178 # Get atoms in the structure according to atom indices

179 atoms = self.structure.atoms

180 return [atoms[atom_index] for atom_index in self.atom_indices]

181

182 def set_atoms(self, new_atoms: List['Atom']):

183 # Find indices for new atoms and set their indices as the new atom indices

184 # Note that atoms must be set in the structure already

185 new_atom_indices = []

186 for new_atom in new_atoms:

187 new_atom_index = new_atom.index

188 if new_atom_index is None:

189 raise ValueError('Atom ' + str(new_atom) + ' is not set in the structure')

190 new_atom_indices.append(new_atom_index)

191 self.set_atom_indices(new_atom_indices)

192 atoms = property(get_atoms, set_atoms, None, "The atoms in this residue")

193

194 # Add an atom to the residue

195 def add_atom(self, new_atom: 'Atom'):

196 # Insert the new atom index in the list of atom indices keeping the order

197 new_atom_index = new_atom.index

198 sorted_atom_index = bisect(self.atom_indices, new_atom_index)

199 self.atom_indices.insert(sorted_atom_index, new_atom_index)

200 # Update the atom internal index

201 new_atom._residue_index = self.index

202

203 # Remove an atom from the residue

204 def remove_atom(self, current_atom: 'Atom'):

205 # Remove the current atom index from the atom indices list

206 self.atom_indices.remove(current_atom.index) # This index MUST be in the list

207 # Update the atom internal index

208 current_atom._residue_index = None

209

210 # The residue chain index according to parent structure chains

211 # If chain index is set then make changes in all the structure to make this change coherent

212 def get_chain_index(self) -> int:

213 return self._chain_index

214

215 def set_chain_index(self, new_chain_index: int):

216 # If there is not strucutre yet it means the chain is beeing set before the structure

217 # We just save the chain index and wait for the structure to be set

218 if not self.structure:

219 self._chain_index = new_chain_index

220 return

221 # Relational indices are updated through a top-down hierarchy

222 # Affected chains are the ones to update this residue internal chain index

223 # WARNING: It is critical to find both current and new chains before removing/adding residues

224 # WARNING: It may happend that we remove the last residue in the current chain and the current chain is purged

225 # WARNING: Thus the 'new_chain_index' would be obsolete since the structure.chains list would have changed

226 current_chain = self.chain

227 new_chain = self.structure.chains[new_chain_index]

228 current_chain.remove_residue(self)

229 new_chain.add_residue(self)

230 chain_index = property(get_chain_index, set_chain_index, None, "The residue chain index according to parent structure chains")

231

232 # The residue chain

233 # If chain is set then make changes in all the structure to make this change coherent

234 def get_chain(self) -> 'Chain':

235 # If there is not strucutre yet it means the residue is beeing set before the structure

236 # In this case it is not possible to get related chain in the structure

237 if not self.structure:

238 return []

239 # Get the chain in the structure according to the chain index

240 return self.structure.chains[self.chain_index]

241

242 def set_chain(self, new_chain: Union['Chain', str]):

243 # In case the chain is just a string we must find/create the corresponding chain

244 if type(new_chain) == str:

245 letter = new_chain

246 # Get the residue structure

247 structure = self.structure

248 if not structure:

249 raise ValueError('Cannot find the corresponding ' + new_chain + ' chain without the structure')

250 # Find if the letter belongs to an already existing chain

251 new_chain = structure.get_chain_by_name(letter)

252 # If the chain does not exist yet then create it

253 if not new_chain:

254 new_chain = Chain(name=letter)

255 structure.set_new_chain(new_chain)

256 # Find the new chain index and set it as the residue chain index

257 # Note that the chain must be set in the structure already

258 new_chain_index = new_chain.index

259 if new_chain_index is None:

260 raise ValueError('Chain ' + str(new_chain) + ' is not set in the structure')

261 self.set_chain_index(new_chain_index)

262 chain = property(get_chain, set_chain, None, "The residue chain")

263

264

265# A chain

266class Chain:

267 def __init__(self, name: Optional[str] = None):

268 self.name = name

269 # Set variables to store references to other related instaces

270 # These variables will be set further by the structure

271 self._structure = None

272 self._index = None

273 self.residue_indices = []

274

275 def __repr__(self):

276 return '<Chain ' + self.name + '>'

277

278 def __eq__(self, other):

279 return self.name == other.name

280

281 # The parent structure(read only)

282 # This value is set by the structure itself

283 def get_structure(self):

284 return self._structure

285 structure = property(get_structure, None, None, "The parent structure(read only)")

286

287 # The residue index according to parent structure residues(read only)

288 # This value is set by the structure itself

289 def get_index(self):

290 return self._index

291

292 # When the index is set all residues are updated with the nex chain index

293 def set_index(self, index: int):

294 for residue in self.residues:

295 residue._chain_index = index

296 self._index = index

297 index = property(get_index, set_index, None, "The residue index according to parent structure residues(read only)")

298

299 # The residue indices according to parent structure residues for residues in this chain

300 # If residue indices are set then make changes in all the structure to make this change coherent

301 def get_residue_indices(self) -> List[int]:

302 return self._residue_indices

303

304 def set_residue_indices(self, new_residue_indices: List[int]):

305 # If there is not strucutre yet it means the chain is beeing set before the structure

306 # We just save residue indices and wait for the structure to be set

307 if not self.structure:

308 self._residue_indices = new_residue_indices

309 return

310 # Update the current residues

311 for residue in self.residues:

312 residue._chain_index = None

313 # Update the new residues

314 for index in new_residue_indices:

315 residue = self.structure.residues[index]

316 residue._chain_index = self.index

317 # In case the new residue indices list is empty this chain must be removed from its structure

318 if len(new_residue_indices) == 0:

319 self.structure.purge_chain(self)

320 # Now new indices are coherent and thus we can save them

321 self._residue_indices = new_residue_indices

322 residue_indices = property(get_residue_indices, set_residue_indices, None, "The residue indices according to parent structure residues for residues in this residue")

323

324 # The residues in this chain

325 # If residues are set then make changes in all the structure to make this change coherent

326 def get_residues(self) -> List['Residue']:

327 # If there is not strucutre yet it means the chain is beeing set before the structure

328 # In this case it is not possible to get related residues in the structure

329 if not self.structure:

330 return []

331 # Get residues in the structure according to residue indices

332 residues = self.structure.residues

333 return [residues[residue_index] for residue_index in self.residue_indices]

334

335 def set_residues(self, new_residues: List['Residue']):

336 # Find indices for new residues and set their indices as the new residue indices

337 # Note that residues must be set in the structure already

338 new_residue_indices = []

339 for new_residue in new_residues:

340 new_residue_index = new_residue.index

341 if new_residue_index is None:

342 raise ValueError('Residue ' + str(new_residue) + ' is not set in the structure')

343 new_residue_indices.append(new_residue_index)

344 self.set_residue_indices(new_residue_indices)

345 residues = property(get_residues, set_residues, None, "The residues in this chain")

346

347 # Add a residue to the chain

348 def add_residue(self, residue: 'Residue'):

349 # Insert the new residue index in the list of residue indices keeping the order

350 sorted_residue_index = bisect(self.residue_indices, residue.index)

351 self.residue_indices.insert(sorted_residue_index, residue.index)

352 # Update the residue internal chain index

353 residue._chain_index = self.index

354

355 # Remove a residue from the chain

356 # WARNING: Note that this function does not trigger the set_residue_indices

357 def remove_residue(self, residue: 'Residue'):

358 self.residue_indices.remove(residue.index) # This index MUST be in the list

359 # If we removed the last index then this chain must be removed from its structure

360 if len(self.residue_indices) == 0 and self.structure:

361 self.structure.purge_chain(self)

362 # Update the residue internal chain index

363 residue._chain_index = None

364

365 # Atom indices for all atoms in the chain(read only)

366 # In order to change atom indices they must be changed in their corresponding residues

367 def get_atom_indices(self) -> List[int]:

368 return sum([residue.atom_indices for residue in self.residues], [])

369 atom_indices = property(get_atom_indices, None, None, "Atom indices for all atoms in the chain(read only)")

370

371 # Atoms in the chain(read only)

372 # In order to change atoms they must be changed in their corresponding residues

373 def get_atoms(self) -> List[int]:

374 return sum([residue.atoms for residue in self.residues], [])

375 atoms = property(get_atoms, None, None, "Atoms in the chain(read only)")

376

377 # Get the residues sequence in one-letter code

378 def get_sequence(self) -> str:

379 return ''.join([residue_name_2_letter(residue.name) for residue in self.residues])

380

381

382# A structure is a group of atoms organized in chains and residues

383class Structure:

384 def __init__(self, atoms: List['Atom'] = [], residues: List['Residue'] = [], chains: List['Chain'] = []):

385 self.atoms = []

386 self.residues = []

387 self.chains = []

388 # Set references between instances

389 for atom in atoms:

390 self.set_new_atom(atom)

391 for residue in residues:

392 self.set_new_residue(residue)

393 for chain in chains:

394 self.set_new_chain(chain)

395

396 def __repr__(self):

397 return '<Structure(' + str(len(self.atoms)) + ' atoms)>'

398

399 # Set a new atom in the structure

400 def set_new_atom(self, atom: 'Atom'):

401 atom._structure = self

402 new_atom_index = len(self.atoms)

403 self.atoms.append(atom)

404 atom._index = new_atom_index

405

406 # Set a new residue in the structure

407 # WARNING: Atoms must be set already before setting residues

408 def set_new_residue(self, residue: 'Residue'):

409 residue._structure = self

410 new_residue_index = len(self.residues)

411 self.residues.append(residue)

412 residue._index = new_residue_index

413 # In case the residue has atom indices, set relational indices on each atom

414 for atom_index in residue.atom_indices:

415 atom = self.atoms[atom_index]

416 atom._residue_index = new_residue_index

417

418 # Set a new chain in the structure

419 # WARNING: Residues and atoms must be set already before setting chains

420 def set_new_chain(self, chain: 'Chain'):

421 chain._structure = self

422 new_chain_index = len(self.chains)

423 self.chains.append(chain)

424 chain._index = new_chain_index

425 # In case the chain has residue indices, set relational indices on each residue

426 for residue_index in chain.residue_indices:

427 residue = self.residues[residue_index]

428 residue._chain_index = new_chain_index

429

430 # Purge chain from the structure

431 # This can be done only when the chain has no residues left in the structure

432 # Renumerate all chain indices which have been offsetted as a result of the purge

433 def purge_chain(self, chain: 'Chain'):

434 # Check the chain can be purged

435 if chain not in self.chains:

436 raise ValueError('Chain ' + chain.name + ' is not in the structure already')

437 if len(chain.residue_indices) > 0:

438 raise ValueError('Chain ' + chain.name + ' is still having residues and thus it cannot be purged')

439 # Get the current index of the chain to be purged

440 purged_index = chain.index

441 # Chains and their residues below this index are not to be modified

442 # Chains and their residues over this index must be renumerated

443 for affected_chain in self.chains[purged_index+1:]:

444 # Chaging the index automatically changes all chain residues '_chain_index' values.

445 affected_chain.index -= 1

446 # Finally, remove the current chain from the list of chains in the structure

447 self.chains.remove(chain)

448

449 # Set the structure from a pdb file

450 @classmethod

451 def from_pdb_file(cls, pdb_filename: str):

452 if not os.path.exists(pdb_filename):

453 raise SystemExit('File "' + pdb_filename + '" not found')

454 # Read the pdb file line by line and set the parsed atoms, residues and chains

455 parsed_atoms = []

456 parsed_residues = []

457 parsed_chains = []

458 atom_index = -1

459 residue_index = -1

460 with open(pdb_filename, 'r') as file:

461 for line in file:

462 # Parse atoms only

463 start = line[0:6]

464 is_atom = start == 'ATOM ' or start == 'HETATM'

465 if not is_atom:

466 continue

467 # Mine all atom data

468 atom_name = line[11:16].strip()

469 residue_name = line[17:21].strip()

470 chain = line[21:22]

471 residue_number = int(line[22:26])

472 icode = line[26:27]

473 if icode == ' ':

474 icode = ''

475 x_coord = float(line[30:38])

476 y_coord = float(line[38:46])

477 z_coord = float(line[46:54])

478 element = line[77:79].strip()

479 # Set the parsed atom, residue and chain

480 parsed_atom = Atom(name=atom_name, element=element, coords=(x_coord, y_coord, z_coord))

481 parsed_residue = Residue(name=residue_name, number=residue_number, icode=icode)

482 parsed_chain = Chain(name=chain)

483 # Add the parsed atom to the list and update the current atom index

484 parsed_atoms.append(parsed_atom)

485 atom_index += 1

486 # Check if we are in the same chain/residue than before

487 same_chain = parsed_chains and parsed_chains[-1] == parsed_chain

488 same_residue = same_chain and parsed_residues and parsed_residue.same_inputs_as(parsed_residues[-1])

489 # Update the residue atom indices

490 # If the residue equals the last parsed residue then use the previous instead

491 if same_residue:

492 parsed_residue = parsed_residues[-1]

493 parsed_residue.atom_indices.append(atom_index)

494 # If it is the same residue then it will be the same chain as well so we can proceed

495 continue

496 # Otherwise, include the new residue in the list and update the current residue index

497 parsed_residues.append(parsed_residue)

498 residue_index += 1

499 parsed_residue.atom_indices.append(atom_index)

500 # If the chain equals the last parsed chain then use the previous instead

501 if same_chain:

502 parsed_chain = parsed_chains[-1]

503 parsed_chain.residue_indices.append(residue_index)

504 continue

505 # Otherwise, include the new chain in the list

506 parsed_chains.append(parsed_chain)

507 parsed_chain.residue_indices.append(residue_index)

508 return cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains)

509

510 # Fix atom elements by gueesing them when missing

511 # Set all elements with the first letter upper and the second(if any) lower

512 def fix_atom_elements(self):

513 for atom in self.atoms:

514 # Make sure elements have the first letter cap and the second letter not cap

515 if atom.element:

516 atom.element = first_cap_only(atom.element)

517 # If elements are missing then guess them from atom names

518 else:

519 atom.element = guess_name_element(atom.name)

520

521 # Generate a pdb file with current structure

522 def generate_pdb_file(self, pdb_filename: str):

523 with open(pdb_filename, "w") as file:

524 file.write('REMARK mdtoolbelt generated pdb file\n')

525 for a, atom in enumerate(self.atoms):

526 residue = atom.residue

527 index = str((a+1) % 100000).rjust(5)

528 name = ' ' + atom.name.ljust(3) if len(atom.name) < 4 else atom.name

529 residue_name = residue.name.ljust(4)

530 chain = atom.chain.name.rjust(1)

531 residue_number = str(residue.number).rjust(4)

532 icode = residue.icode.rjust(1)

533 coords = atom.coords

534 x_coord, y_coord, z_coord = ["{:.3f}".format(coord).rjust(8) for coord in coords]

535 occupancy = '1.00' # Just a placeholder

536 temp_factor = '0.00' # Just a placeholder

537 element = atom.element

538 atom_line = ('ATOM ' + index + ' ' + name + ' ' + residue_name + chain + residue_number + icode + ' ' + x_coord + y_coord + z_coord + ' ' + occupancy + ' ' + temp_factor + ' ' + element).ljust(80) + '\n'

539 file.write(atom_line)

540

541 # Get a chain by its name

542 def get_chain_by_name(self, name: str) -> 'Chain':

543 return next((c for c in self.chains if c.name == name), None)

544

545 # Get a summary of the structure

546 def display_summary(self):

547 print('Atoms: ' + str(len(self.atoms)))

548 print('Residues: ' + str(len(self.residues)))

549 print('Chains: ' + str(len(self.chains)))

550 for chain in self.chains:

551 print('Chain ' + chain.name + ' (' + str(len(chain.residue_indices)) + ' residues)')

552 print(' -> ' + chain.get_sequence())

553

554 # This is an alternative system to find protein chains(anything else is chained as 'X')

555 # This system does not depend on VMD

556 # It totally overrides previous chains since it is expected to be used only when chains are missing

557 def raw_protein_chainer(self):

558 current_chain = self.get_next_available_chain_name()

559 previous_alpha_carbon = None

560 for residue in self.residues:

561 alpha_carbon = next((atom for atom in residue.atoms if atom.name == 'CA'), None)

562 if not alpha_carbon:

563 residue.set_chain('X')

564 continue

565 # Connected aminoacids have their alpha carbons at a distance of around 3.8 Ångstroms

566 residues_are_connected = previous_alpha_carbon and calculate_distance(previous_alpha_carbon, alpha_carbon) < 4

567 if not residues_are_connected:

568 current_chain = self.get_next_available_chain_name()

569 residue.set_chain(current_chain)

570 previous_alpha_carbon = alpha_carbon

571

572 # Get the next available chain name

573 # Find alphabetically the first letter which is not yet used as a chain name

574 # If all letters in the alphabet are used already then return None

575 def get_next_available_chain_name(self) -> Optional[str]:

576 current_chain_names = [chain.name for chain in self.chains]

577 return next((name for name in available_caps if name not in current_chain_names), None)

578

579

580# Calculate the distance between two atoms

581def calculate_distance(atom_1: Atom, atom_2: Atom) -> float:

582 squared_distances_sum = 0

583 for i in {'x': 0, 'y': 1, 'z': 2}.values():

584 squared_distances_sum += (atom_1.coords[i] - atom_2.coords[i])**2

585 return math.sqrt(squared_distances_sum)

586

587

588# Set all available chains according to pdb standards

589available_caps = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',

590 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']

591

592# Protein residues

593aminoacids = {

594 "ALA": "A",

595 "ALAN": "A",

596 "ALAC": "A",

597 "ARG": "R",

598 "ARGN": "R",

599 "ARGC": "R",

600 "ASN": "N",

601 "ASNN": "N",

602 "ASNC": "N",

603 "ASP": "D",

604 "ASPN": "D",

605 "ASPC": "D",

606 "CYS": "C",

607 "CYSN": "C",

608 "CYSC": "C",

609 "CYH": "C",

610 "CSH": "C",

611 "CSS": "C",

612 "CYX": "C",

613 "CYP": "C",

614 "GLN": "Q",

615 "GLNN": "Q",

616 "GLNC": "Q",

617 "GLU": "E",

618 "GLUN": "E",

619 "GLUC": "E",

620 "GLY": "G",

621 "GLYN": "G",

622 "GLYC": "G",

623 "HIS": "H",

624 "HISN": "H",

625 "HISC": "H",

626 "HID": "H",

627 "HIE": "H",

628 "HIP": "H",

629 "HSD": "H",

630 "HSE": "H",

631 "ILE": "I",

632 "ILEN": "I",

633 "ILEC": "I",

634 "ILU": "I",

635 "LEU": "L",

636 "LEUN": "L",

637 "LEUC": "L",

638 "LYS": "K",

639 "LYSN": "K",

640 "LYSC": "K",

641 "MET": "M",

642 "METN": "M",

643 "METC": "M",

644 "PHE": "F",

645 "PHEN": "F",

646 "PHEC": "F",

647 "PRO": "P",

648 "PRON": "P",

649 "PROC": "P",

650 "PRØ": "P",

651 "PR0": "P",

652 "PRZ": "P",

653 "SER": "S",

654 "SERN": "S",

655 "SERC": "S",

656 "THR": "T",

657 "THRN": "T",

658 "THRC": "R",

659 "TRP": "W",

660 "TRPN": "W",

661 "TRPC": "W",

662 "TRY": "W",

663 "TYR": "Y",

664 "TYRN": "Y",

665 "TYRC": "Y",

666 "VAL": "V",

667 "VALN": "V",

668 "VALC": "V"

669}

670

671# Nucleic acid residues

672nucleotides = {

673 "A": "A",

674 "A3": "A",

675 "A5": "A",

676 "C": "C",

677 "C3": "C",

678 "C5": "C",

679 "T": "T",

680 "T3": "T",

681 "T5": "T",

682 "G": "G",

683 "G3": "G",

684 "G5": "G",

685 "U": "U",

686 "U3": "U",

687 "U5": "U",

688 "DA": "A",

689 "DT": "T",

690 "DC": "C",

691 "DG": "G",

692}

693

694# All supported residues

695supported_residues = {**aminoacids, **nucleotides}

696

697

698# Transform a residue name to its equivalent single letter code

699# If the residue name is not recognized then return "X"

700# e.g. "ARG" -> "R", "WTF" -> "X"

701# You can choose which residue types are targeted(e.g. aminoacids only)

702# Options are: 'all', 'aminoacids' or 'nucleotides'

703# All residue types are targeted by default

704def residue_name_2_letter(residue_name: str, residue_types: str = "all") -> str:

705 # Set the target residues

706 if residue_types == "all":

707 target_residues = supported_residues

708 elif residue_types == "aminoacids":

709 target_residues = aminoacids

710 elif residue_types == "nucleotides":

711 target_residues = nucleotides

712 else:

713 raise ValueError('Unrecognized residue types ' + str(residue_types))

714 # Now find the corresponding letter among the target residues

715 ref = target_residues.get(residue_name, False)

716 return ref if ref else "X"

717

718

719# Map the structure aminoacids sequences against the standard reference sequences

720# References are uniprot accession ids and they are optional

721# For each reference, align the reference sequence with the topology sequence

722# Chains which do not match any reference sequence will be blasted

723# Note that an internet connection is required both to retireve the uniprot reference sequence and to do the blast

724# NEVER FORGET: This system relies on the fact that topology chains are not repeated

725def generate_map_online(structure: 'Structure', forced_references: List[str] = []) -> Optional[dict]:

726 # Store all the references which are got through this process

727 # Note that not all references may be used at the end

728 references = {}

729 # Get the structure chain sequences

730 structure_sequences = get_chain_sequences(structure)

731 # Find out which chains are protein

732 protein_sequences = []

733 for structure_sequence in structure_sequences:

734 sequence = structure_sequence['sequence']

735 if next((letter for letter in sequence if letter != 'X'), None):

736 structure_sequence['match'] = {'ref': None, 'map': None, 'score': 0}

737 protein_sequences.append(structure_sequence)

738 # For each input forced reference, get the reference sequence

739 reference_sequences = {}

740 if forced_references:

741 # Check forced_references is not a string

742 if type(forced_references) == str:

743 raise TypeError('Forced references should be a list and not a string a this point')

744 for uniprot_id in forced_references:

745 reference = get_uniprot_reference(uniprot_id)

746 reference_sequences[reference['uniprot']] = reference['sequence']

747 # Save the current whole reference object for later

748 references[reference['uniprot']] = reference

749

750 # Try to match all protein sequences with the available reference sequences

751 # In case of match, objects in the 'protein_sequences' list are modified by adding the result

752 # Finally, return True if all protein sequences were matched with the available reference sequences or False if not

753 def match_sequences() -> bool:

754 # Track each chain-reference alignment match and keep the score of successful alignments

755 # Now for each structure sequence, align all reference sequences and keep the best alignment(if it meets the minimum)

756 for structure_sequence in protein_sequences:

757 for uniprot_id, reference_sequence in reference_sequences.items():

758 # Align the structure sequence with the reference sequence

759 align_results = align(reference_sequence, structure_sequence['sequence'])

760 if not align_results:

761 continue

762 # In case we have a valid alignment, check the alignment score is better than the current reference score(if any)

763 sequence_map, align_score = align_results

764 current_reference = structure_sequence['match']

765 if current_reference['score'] > align_score:

766 continue

767 reference = references[uniprot_id]

768 # If the alignment is better then we impose the new reference

769 structure_sequence['match'] = {'ref': reference, 'map': sequence_map, 'score': align_score}

770 # Sum up the current matching

771 print('Reference summary:')

772 for structure_sequence in structure_sequences:

773 name = structure_sequence['name']

774 match = structure_sequence.get('match', None)

775 if not match:

776 print(' ' + name + ' -> Not protein')

777 continue

778 reference = structure_sequence['match'].get('ref', None)

779 if not reference:

780 print(' ' + name + ' -> ¿?')

781 continue

782 uniprot_id = reference['uniprot']

783 print(' ' + name + ' -> ' + uniprot_id)

784 # Finally, return True if all protein sequences were matched with the available reference sequences or False if not

785 return all([structure_sequence['match']['ref'] for structure_sequence in protein_sequences])

786 # If we have every protein chain matched with a reference already then we stop here

787 if match_sequences():

788 return format_topology_data(structure, protein_sequences)

789 # If there are still any chain which is not matched with a reference then we need more references

790 # To get them, we run a blast with each orphan chain sequence

791 for structure_sequence in protein_sequences:

792 # Skip already references chains

793 if structure_sequence['match']['ref']:

794 continue

795 # Run the blast

796 sequence = structure_sequence['sequence']

797 uniprot_id = blast(sequence)

798 # Build a new reference from the resulting uniprot

799 reference = get_uniprot_reference(uniprot_id)

800 reference_sequences[reference['uniprot']] = reference['sequence']

801 # Save the current whole reference object for later

802 references[reference['uniprot']] = reference

803 # If we have every protein chain matched with a reference already then we stop here

804 if match_sequences():

805 return format_topology_data(structure, protein_sequences)

806 print('The BLAST failed to find a matching reference sequence for at least one protein sequence')

807 return None

808

809

810# Try to match all protein sequences with the available reference sequences

811# In case of match, objects in the 'protein_sequences' list are modified by adding the result

812# Finally, return True if all protein sequences were matched with the available reference sequences or False if not

813def match_sequences(protein_sequences: list, reference_sequences: dict) -> bool:

814 # Track each chain-reference alignment match and keep the score of successful alignments

815 # Now for each structure sequence, align all reference sequences and keep the best alignment(if it meets the minimum)

816 for structure_sequence in protein_sequences:

817 for uniprot_id, reference_sequence in reference_sequences.items():

818 # Align the structure sequence with the reference sequence

819 align_results = align(reference_sequence, structure_sequence['sequence'])

820 if not align_results:

821 continue

822 # In case we have a valid alignment, check the alignment score is better than the current reference score(if any)

823 sequence_map, align_score = align_results

824 current_reference = structure_sequence['match']

825 if current_reference['score'] > align_score:

826 continue

827 reference = references[uniprot_id]

828

829 # If the alignment is better then we impose the new reference

830 structure_sequence['match'] = {'ref': reference, 'map': sequence_map, 'score': align_score}

831 # Finally, return True if all protein sequences were matched with the available reference sequences or False if not

832 return all([structure_sequence['match']['ref'] for structure_sequence in protein_sequences])

833

834

835# Reformat mapping data to the topology system(introduced later)

836def format_topology_data(structure: 'Structure', mapping_data: list) -> dict:

837 # Get the count of residues from the structure

838 residues_count = len(structure.residues)

839 # Now format data

840 reference_ids = []

841 residue_reference_indices = [None] * residues_count

842 residue_reference_numbers = [None] * residues_count

843 for data in mapping_data:

844 match = data['match']

845 # Get the reference index

846 # Note that several matches may belong to the same reference and thus have the same index

847 reference = match['ref']

848 uniprot_id = reference['uniprot']

849 if uniprot_id not in reference_ids:

850 reference_ids.append(reference['uniprot'])

851 reference_index = reference_ids.index(uniprot_id)

852 residue_indices = data['residue_indices']

853 for r, residue_number in enumerate(match['map']):

854 if residue_number is None:

855 continue

856 residue_index = residue_indices[r]

857 residue_reference_indices[residue_index] = reference_index

858 residue_reference_numbers[residue_index] = residue_number

859 # If there are not references at the end then set all fields as None, in order to save space

860 if len(reference_ids) == 0:

861 reference_ids = None

862 residue_reference_indices = None

863 residue_reference_numbers = None

864 # Return the 3 topology fields as they are in the database

865 residues_map = {'references': reference_ids, 'residue_reference_indices': residue_reference_indices, 'residue_reference_numbers': residue_reference_numbers}

866 return residues_map

867

868

869# Align a reference aminoacid sequence with each chain sequence in a topology

870# NEVER FORGET: This system relies on the fact that topology chains are not repeated

871def map_sequence(ref_sequence: str, structure: 'Structure') -> list:

872 sequences = get_chain_sequences(structure)

873 mapping = []

874 for s in sequences:

875 sequence = sequences[s]

876 sequence_map = align(ref_sequence, sequence)

877 mapping += sequence_map

878 return mapping

879

880

881# Get each chain name and aminoacids sequence in a topology

882# Output format example: [{'sequence': 'VNLTT', 'indices': [1, 2, 3, 4, 5]}, ...]

883def get_chain_sequences(structure: 'Structure') -> list:

884 sequences = []

885 chains = structure.chains

886 for chain in chains:

887 name = chain.name

888 sequence = ''

889 residue_indices = []

890 for residue in chain.residues:

891 letter = residue_name_2_letter(residue.name, 'aminoacids')

892 sequence += letter

893 residue_indices.append(residue.index)

894 # Save sequences by chain name (i.e. chain id or chain letter)

895 sequence_object = {'name': name, 'sequence': sequence, 'residue_indices': residue_indices}

896 sequences.append(sequence_object)

897 return sequences

898

899

900# Align two aminoacid sequences

901# Return a list with the reference residue indexes (values)

902# which match each new sequence residues indexes (indexes)

903# Return also the score of the alignment

904# Return None when there is not valid alignment at all

905def align(ref_sequence: str, new_sequence: str) -> Optional[Tuple[list, float]]:

906

907 # print('- REFERENCE\n' + ref_sequence + '\n- NEW\n' + new_sequence)

908

909 # If the new sequence is all 'X' stop here, since this would make the alignment infinite

910 # Then an array filled with None is returned

911 if all([letter == 'X' for letter in new_sequence]):

912 return None

913

914 # Return the new sequence as best aligned as possible with the reference sequence

915 alignments = pairwise2.align.localds(ref_sequence, new_sequence, blosum62, -10, -0.5)

916 # DANI: Habría que hacerlo de esta otra forma según el deprecation warning (arriba hay más código)

917 # DANI: El problema es que el output lo tiene todo menos la sequencia en formato alienada

918 # DANI: i.e. formato '----VNLTT', que es justo el que necesito

919 # alignments = aligner.align(ref_sequence, new_sequence)

920

921 # In case there are no alignments it means the current chain has nothing to do with this reference

922 # Then an array filled with None is returned

923 if len(alignments) == 0:

924 return None

925

926 # Several alignments may be returned, specially when it is a difficult or impossible alignment

927 # Output format example: '----VNLTT'

928 best_alignment = alignments[0]

929 aligned_sequence = best_alignment[1]

930 print(format_alignment(*alignments[0]))

931 score = alignments[0][2]

932 # WARNING: Do not use 'aligned_sequence' length here since it has the total sequence length

933 normalized_score = score / len(new_sequence)

934 print('Normalized score: ' + str(normalized_score))

935

936 # If the normalized score does not reaches the minimum we consider the alignment is not valid

937 # It may happen when the reference goes for a specific chain but we must map all chains

938 # This 1 has been found experimentally

939 # Non maching sequence may return a 0.1-0.3 normalized score

940 # Matching sequence may return >4 normalized score

941 if normalized_score < 1:

942 print('Not valid alignment')

943 return None

944

945 # Match each residue

946 aligned_mapping = []

947 aligned_index = 0

948 for l_index, letter in enumerate(aligned_sequence):

949 # Guions are skipped

950 if letter == '-':

951 continue

952 # Get the current residue of the new sequence

953 equivalent_letter = new_sequence[aligned_index]

954 if not letter == equivalent_letter:

955 raise SystemExit('Something was wrong:S')

956 # 'X' residues cannot be mapped since reference sequences should never have any 'X'

957 if letter == 'X':

958 aligned_mapping.append(None)

959 aligned_index += 1

960 continue

961 # Otherwise add the equivalent aligned index to the mapping

962 # WARNING: Add +1 since uniprot residue counts start at 1, not 0

963 aligned_mapping.append(l_index + 1)

964 aligned_index += 1

965

966 return aligned_mapping, normalized_score

967

968

969# Given an aminoacids sequence, return a list of uniprot ids

970# Note that we are blasting against UniProtKB / Swiss-Prot so results will always be valid UniProt accessions

971# WARNING: This always means results will correspond to curated entries only

972# If your sequence is from an exotic organism the result may be not from it but from other more studied organism

973def blast(sequence: str) -> List[str]:

974 print('Throwing blast... (this may take a few minutes)')

975 result = NCBIWWW.qblast(program="blastp", database="swissprot", sequence=sequence)

976 parsed_result = xmltodict.parse(result.read())

977 hits = parsed_result['BlastOutput']['BlastOutput_iterations']['Iteration']['Iteration_hits']['Hit']

978 # Get the first result only

979 result = hits[0]

980 # Return the accession

981 # DANI: Si algun día tienes problemas porque te falta el '.1' al final del accession puedes sacarlo de Hit_id

982 accession = result['Hit_accession']

983 print('Result: ' + accession)

984 print(result['Hit_def'])

985 return accession

986

987

988# Given a uniprot accession, use the uniprot API to request its data and then mine what is needed for the database

989def get_uniprot_reference(uniprot_accession: str) -> dict:

990 # Request Uniprot

991 request_url = 'https://www.ebi.ac.uk/proteins/api/proteins/' + uniprot_accession

992 try:

993 with urllib.request.urlopen(request_url) as response:

994 parsed_response = json.loads(response.read().decode("utf-8"))

995 # If the accession is not found in UniProt then the id is not valid

996 except urllib.error.HTTPError as error:

997 if error.code == 400:

998 raise ValueError('Something went wrong with the Uniprot request: ' + request_url)

999 # Get the aminoacids sequence

1000 sequence = parsed_response['sequence']['sequence']

1001 return {'uniprot': uniprot_accession, 'sequence': sequence}

Coverage for biobb_model/model/fix_pdb_utils.py: 63%

542 statements