Coverage for biobb_godmd / godmd / godmd_prep.py: 93%
150 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-22 11:45 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-22 11:45 +0000
1#!/usr/bin/env python3
3"""Module containing the GOdMDPrep class and the command line interface."""
4from typing import Optional
5from pathlib import Path
6from biobb_common.generic.biobb_object import BiobbObject
7from biobb_common.tools import file_utils as fu
8from biobb_common.tools.file_utils import launchlogger
9from biobb_godmd.godmd.common import check_input_path, check_output_path
12class GOdMDPrep(BiobbObject):
13 """
14 | biobb_godmd GOdMDPrep
15 | Helper bb to prepare inputs for the `GOdMD tool <http://mmb.irbbarcelona.org/GOdMD/>`_ module.
16 | Prepares input files for the GOdMD tool.
18 Args:
19 input_pdb_orig_path (str): Input PDB file to be used as origin in the conformational transition. File type: input. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/main/biobb_godmd/test/data/godmd/1ake_A.pdb>`_. Accepted formats: pdb (edam:format_1476).
20 input_pdb_target_path (str): Input PDB file to be used as target in the conformational transition. File type: input. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/main/biobb_godmd/test/data/godmd/4ake_A.pdb>`_. Accepted formats: pdb (edam:format_1476).
21 output_aln_orig_path (str): Output GOdMD alignment file corresponding to the origin structure of the conformational transition. File type: output. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/main/biobb_godmd/test/data/godmd/1ake_A.aln>`_. Accepted formats: aln (edam:format_2330), txt (edam:format_2330).
22 output_aln_target_path (str): Output GOdMD alignment file corresponding to the target structure of the conformational transition. File type: output. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/main/biobb_godmd/test/data/godmd/4ake_A.aln>`_. Accepted formats: aln (edam:format_2330), txt (edam:format_2330).
23 properties (dict - Python dictionary object containing the tool parameters, not input/output files):
24 * **gapopen** (*float*) - (12.0) Standard gap penalty: score taken away when a gap is created.
25 * **gapextend** (*float*) - (2.0) Penalty added to the standard gap penalty for each base or residue in the gap.
26 * **datafile** (*str*) - ("EPAM250") Scoring matrix file used when comparing sequences.
27 * **binary_path** (*str*) - ("water") Binary path.
28 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
29 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
30 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.
32 Examples:
33 This is a use example of how to use the building block from Python::
35 from biobb_godmd.godmd.godmd_prep import godmd_prep
36 prop = {
37 'gapopen': 10.0,
38 'gapextend': 2.0
39 }
40 godmd_prep( input_pdb_orig_path='/path/to/input_orig.pdb',
41 input_pdb_target_path='/path/to/input_target.pdb',
42 output_aln_orig_path='/path/to/orig.aln',
43 output_aln_target_path='/path/to/target.aln',
44 properties=prop)
46 Info:
47 * wrapped_software:
48 * name: In house
49 * license: Apache-2.0
50 * ontology:
51 * name: EDAM
52 * schema: http://edamontology.org/EDAM.owl
54 """
56 def __init__(self, input_pdb_orig_path: str, input_pdb_target_path: str,
57 output_aln_orig_path: str, output_aln_target_path: str,
58 properties: Optional[dict] = None, **kwargs) -> None:
60 properties = properties or {}
62 # Call parent class constructor
63 super().__init__(properties)
64 self.locals_var_dict = locals().copy()
66 self.AA_TRANSLATOR = {'GLY': 'G',
67 'ALA': 'A',
68 'VAL': 'V',
69 'LEU': 'L',
70 'ILE': 'I',
71 'MET': 'M',
72 'PHE': 'F',
73 'TRP': 'W',
74 'PRO': 'P',
75 'SER': 'S',
76 'THR': 'T',
77 'CYS': 'C',
78 'CYX': 'C',
79 'CYM': 'C',
80 'TYR': 'Y',
81 'TYM': 'Y',
82 'ASN': 'N',
83 'GLN': 'Q',
84 'ASP': 'D',
85 'ASH': 'D',
86 'GLU': 'E',
87 'GLH': 'E',
88 'LYS': 'K',
89 'LYN': 'K',
90 'ARG': 'R',
91 'ARN': 'R',
92 'HIS': 'H',
93 'HIE': 'H',
94 'HID': 'H',
95 'HIP': 'H'}
97 # Input/Output files
98 self.io_dict = {
99 'in': {'input_pdb_orig_path': input_pdb_orig_path,
100 'input_pdb_target_path': input_pdb_target_path},
101 'out': {'output_aln_orig_path': output_aln_orig_path,
102 'output_aln_target_path': output_aln_target_path}
103 }
105 # Properties specific for BB
106 self.properties = properties
107 self.gapopen = properties.get('gapopen', 12.0)
108 self.gapextend = properties.get('gapextend', 2.0)
109 self.datafile = properties.get('datafile', "EPAM250")
110 self.binary_path = properties.get('binary_path', "water")
112 # Check the properties
113 self.check_properties(properties)
114 self.check_arguments()
116 def check_data_params(self, out_log, out_err):
117 """ Checks input/output paths correctness """
119 # Check input(s)
120 self.io_dict["in"]["input_pdb_orig_path"] = check_input_path(self.io_dict["in"]["input_pdb_orig_path"], "input_pdb_orig_path", False, out_log, self.__class__.__name__)
121 self.io_dict["in"]["input_pdb_target_path"] = check_input_path(self.io_dict["in"]["input_pdb_target_path"], "input_pdb_target_path", False, out_log, self.__class__.__name__)
123 # Check output(s)
124 self.io_dict["out"]["output_aln_orig_path"] = check_output_path(self.io_dict["out"]["output_aln_orig_path"], "output_aln_orig_path", False, out_log, self.__class__.__name__)
125 self.io_dict["out"]["output_aln_target_path"] = check_output_path(self.io_dict["out"]["output_aln_target_path"], "output_aln_target_path", False, out_log, self.__class__.__name__)
127 def extract_sequence(self, pdb):
128 '''
129 Parses a PDB file to retrieve the sequence of a certain chain.
130 '''
131 seq = ''
132 resids = []
133 with open(pdb) as file:
134 for line in file:
135 line = line.rstrip('\n')
136 record = line[:6].replace(" ", "")
137 if record != 'ATOM':
138 continue
139 atomname = line[12:16]
140 atomname = atomname.replace(" ", "")
141 if atomname != 'CA':
142 continue
143 alternate = line[16]
144 if alternate != ' ':
145 if alternate != 'A':
146 continue # if alt loc, only A-labeled residues
147 resname = line[17:20].replace(" ", "")
148 if resname not in self.AA_TRANSLATOR:
149 continue
150 resnum = line[22:26].replace(" ", "")
151 icode = line[26:27]
152 seq += self.AA_TRANSLATOR[resname]
153 resids += [(' ', int(resnum), icode)] # BioPython's standard PDB atom id (' ',resid,insertioncode) (e.g. (' ',40,'B'))
155 return seq, resids
157 def retrieve_alignment(self, waterFile, resids1, resids2):
158 '''
159 Gets the FASTA sequence of two PDB structures and a list of their residues in Biopython residue format.
160 Opens a file containing a local sequence alignment (Water program of EMBOSS package) between those two PDB structures.
161 Returns the sequence identity and the pairs of residues of the alignment.
162 '''
163 # Parsing Water alignment file
164 is_alignment = 0
165 is_qseq = 0
166 is_sseq = 0
167 qseq = ""
168 sseq = ""
169 ident = ""
170 next_seq = 0
171 qstart = ""
172 sstart = ""
173 with open(waterFile) as file:
174 for line in file:
175 line = line.rstrip("\n")
176 if is_alignment == 0:
177 if not line.startswith(">>#1"):
178 continue
179 else:
180 is_alignment = 1
181 else:
182 if line.startswith("; sw_ident:"):
183 ident = line[12:]
184 elif line.startswith("; al_start:"):
185 if next_seq == 0:
186 qstart = line[12:]
187 else:
188 sstart = line[12:]
189 elif line.startswith("; al_display_start:"):
190 if next_seq == 0:
191 is_qseq = 1
192 else:
193 is_sseq = 1
194 else:
195 if is_qseq == 1:
196 if not line.startswith(">"):
197 qseq = qseq + line
198 else:
199 is_qseq = 0
200 next_seq = 1
201 if is_sseq == 1:
202 if line != "":
203 sseq = sseq + line
204 else:
205 is_sseq = 0
207 # Get the Sequence Identity of the alignment
208 seq_id = float(ident)
210 # Get the residues of the alignment.
211 # -2 is applied because in the first iteration of next loop all values will always be increased by 1 (local alignment)
212 # and we also need to convert the number into a python index to retrieve the proper token in the list
213 resid_pairs = []
214 idx1 = int(qstart)-2
215 idx2 = int(sstart)-2
216 for i in range(len(qseq)): # it could be also len(sseq)
217 if qseq[i] != '-':
218 idx1 += 1
219 if sseq[i] != '-':
220 idx2 += 1
221 if qseq[i] == '-' or sseq[i] == '-':
222 continue
223 resid_pairs += [(resids1[idx1], resids2[idx2])]
224 # Check contents of the residues of the alignment
225 if len(resid_pairs) == 0: # Alignment file was empty or there was no possible matching residues between the PDBs
226 fu.log('Alignment file was empty or there was no possible matching residues between the PDBs' % self.stage_io_dict["unique_dir"], self.out_log)
227 return False, False
228 else:
229 return seq_id, resid_pairs
231 @launchlogger
232 def launch(self):
233 """Launches the execution of the GOdMDPrep module."""
235 # check input/output paths and parameters
236 self.check_data_params(self.out_log, self.err_log)
238 # Setup Biobb
239 if self.check_restart():
240 return 0
241 self.stage_files()
243 # Starting the work...
244 # Parsing the input PDB files
245 pdb1 = self.io_dict["in"]["input_pdb_orig_path"]
246 pdb2 = self.io_dict["in"]["input_pdb_target_path"]
248 # Generate sequence of first PDB
249 seq1, resids1 = self.extract_sequence(pdb1)
251 # Generate sequence of second PDB
252 seq2, resids2 = self.extract_sequence(pdb2)
254 if len(seq1) < 50:
255 fu.log('WARNING: Short sequence (ORIGIN)' % self.stage_io_dict["unique_dir"], self.out_log)
256 if len(seq2) < 50:
257 fu.log('WARNING: Short sequence (TARGET)' % self.stage_io_dict["unique_dir"], self.out_log)
259 # Produce FASTA files
260 name1 = "fasta1.fa"
261 name2 = "fasta2.fa"
262 fasta1Filename = str(Path(self.stage_io_dict["unique_dir"]).joinpath(name1))
263 fasta1File = open(fasta1Filename, "w")
264 fasta1File.write(">"+pdb1+"\n"+seq1)
265 fasta1File.close()
266 fasta2Filename = str(Path(self.stage_io_dict["unique_dir"]).joinpath(name2))
267 fasta2File = open(fasta2Filename, "w")
268 fasta2File.write(">"+pdb2+"\n"+seq2)
269 fasta2File.close()
271 waterFilename = str(Path(self.stage_io_dict["unique_dir"]).joinpath("water_align.out"))
273 # water -auto -outfile=water_align.out -asequence=1ake.chains.nolig.pdb.fa
274 # -bsequence=4ake.chains.pdb.fa -gapopen=12 -gapextend=2
275 # -datafile=EPAM250 -aformat=markx10
277 # Command line
278 self.cmd = [self.binary_path,
279 '-auto',
280 '-outfile', waterFilename,
281 '-asequence', fasta1Filename,
282 '-bsequence', fasta2Filename,
283 '-gapopen', str(self.gapopen),
284 '-gapextend', str(self.gapextend),
285 '-datafile', self.datafile,
286 '-aformat', "markx10"
287 ]
289 # Run Biobb block
290 self.run_biobb()
292 # Starting post-alignment process: generating .aln files
294 # Retrieve sequence identity of the pair, based on local sequence alignment
295 # previously computed, and generate the pairs of residues from both structres
296 # to drive the superimposition
297 seq_id, resid_pairs = self.retrieve_alignment(waterFilename, resids1, resids2)
299 aln1Filename = self.io_dict["out"]["output_aln_orig_path"]
300 aln1File = open(aln1Filename, 'w')
301 for pair in resid_pairs:
302 aln1File.write("%s%s\n" % (pair[0][1], pair[0][2].replace(" ", "")))
303 aln1File.close()
305 aln2Filename = self.io_dict["out"]["output_aln_target_path"]
306 aln2File = open(aln2Filename, 'w')
307 for pair in resid_pairs:
308 aln2File.write("%s%s\n" % (pair[1][1], pair[1][2].replace(" ", "")))
309 aln2File.close()
311 # Copy files to host
312 self.copy_to_host()
314 # Remove temporary folder(s)
315 self.remove_tmp_files()
317 self.check_arguments(output_files_created=True, raise_exception=False)
319 return self.return_code
322def godmd_prep(input_pdb_orig_path: str, input_pdb_target_path: str,
323 output_aln_orig_path: str, output_aln_target_path: str,
324 properties: Optional[dict] = None, **kwargs) -> int:
325 """Create :class:`GOdMDPrep <godmd.godmd_prep.GOdMDPrep>`godmd.godmd_prep.GOdMDPrep class and
326 execute :meth:`launch() <godmd.godmd_prep.GOdMDPrep.launch>` method"""
327 return GOdMDPrep(**dict(locals())).launch()
330godmd_prep.__doc__ = GOdMDPrep.__doc__
331main = GOdMDPrep.get_main(godmd_prep, "Prepares input files for the GOdMD tool.")
333if __name__ == '__main__':
334 main()