Coverage for biobb_godmd/godmd/godmd_prep.py: 87%
163 statements
« prev ^ index » next coverage.py v7.9.1, created at 2025-06-25 07:50 +0000
« prev ^ index » next coverage.py v7.9.1, created at 2025-06-25 07:50 +0000
1#!/usr/bin/env python3
3"""Module containing the GOdMDPrep class and the command line interface."""
4import argparse
5from typing import Optional
6from pathlib import Path
7from biobb_common.generic.biobb_object import BiobbObject
8from biobb_common.configuration import settings
9from biobb_common.tools import file_utils as fu
10from biobb_common.tools.file_utils import launchlogger
11from biobb_godmd.godmd.common import check_input_path, check_output_path
14class GOdMDPrep(BiobbObject):
15 """
16 | biobb_godmd GOdMDPrep
17 | Helper bb to prepare inputs for the `GOdMD tool <http://mmb.irbbarcelona.org/GOdMD/>`_ module.
18 | Prepares input files for the GOdMD tool.
20 Args:
21 input_pdb_orig_path (str): Input PDB file to be used as origin in the conformational transition. File type: input. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/master/biobb_godmd/test/data/godmd/1ake_A.pdb>`_. Accepted formats: pdb (edam:format_1476).
22 input_pdb_target_path (str): Input PDB file to be used as target in the conformational transition. File type: input. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/master/biobb_godmd/test/data/godmd/4ake_A.pdb>`_. Accepted formats: pdb (edam:format_1476).
23 output_aln_orig_path (str): Output GOdMD alignment file corresponding to the origin structure of the conformational transition. File type: output. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/master/biobb_godmd/test/data/godmd/1ake_A.aln>`_. Accepted formats: aln (edam:format_2330), txt (edam:format_2330).
24 output_aln_target_path (str): Output GOdMD alignment file corresponding to the target structure of the conformational transition. File type: output. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/master/biobb_godmd/test/data/godmd/4ake_A.aln>`_. Accepted formats: aln (edam:format_2330), txt (edam:format_2330).
25 properties (dict - Python dictionary object containing the tool parameters, not input/output files):
26 * **gapopen** (*float*) - (12.0) Standard gap penalty: score taken away when a gap is created.
27 * **gapextend** (*float*) - (2.0) Penalty added to the standard gap penalty for each base or residue in the gap.
28 * **datafile** (*str*) - ("EPAM250") Scoring matrix file used when comparing sequences.
29 * **binary_path** (*str*) - ("water") Binary path.
30 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
31 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
32 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.
34 Examples:
35 This is a use example of how to use the building block from Python::
37 from biobb_godmd.godmd.godmd_prep import godmd_prep
38 prop = {
39 'gapopen': 10.0,
40 'gapextend': 2.0
41 }
42 godmd_prep( input_pdb_orig_path='/path/to/input_orig.pdb',
43 input_pdb_target_path='/path/to/input_target.pdb',
44 output_aln_orig_path='/path/to/orig.aln',
45 output_aln_target_path='/path/to/target.aln',
46 properties=prop)
48 Info:
49 * wrapped_software:
50 * name: In house
51 * license: Apache-2.0
52 * ontology:
53 * name: EDAM
54 * schema: http://edamontology.org/EDAM.owl
56 """
58 def __init__(self, input_pdb_orig_path: str, input_pdb_target_path: str,
59 output_aln_orig_path: str, output_aln_target_path: str,
60 properties: Optional[dict] = None, **kwargs) -> None:
62 properties = properties or {}
64 # Call parent class constructor
65 super().__init__(properties)
66 self.locals_var_dict = locals().copy()
68 self.AA_TRANSLATOR = {'GLY': 'G',
69 'ALA': 'A',
70 'VAL': 'V',
71 'LEU': 'L',
72 'ILE': 'I',
73 'MET': 'M',
74 'PHE': 'F',
75 'TRP': 'W',
76 'PRO': 'P',
77 'SER': 'S',
78 'THR': 'T',
79 'CYS': 'C',
80 'CYX': 'C',
81 'CYM': 'C',
82 'TYR': 'Y',
83 'TYM': 'Y',
84 'ASN': 'N',
85 'GLN': 'Q',
86 'ASP': 'D',
87 'ASH': 'D',
88 'GLU': 'E',
89 'GLH': 'E',
90 'LYS': 'K',
91 'LYN': 'K',
92 'ARG': 'R',
93 'ARN': 'R',
94 'HIS': 'H',
95 'HIE': 'H',
96 'HID': 'H',
97 'HIP': 'H'}
99 # Input/Output files
100 self.io_dict = {
101 'in': {'input_pdb_orig_path': input_pdb_orig_path,
102 'input_pdb_target_path': input_pdb_target_path},
103 'out': {'output_aln_orig_path': output_aln_orig_path,
104 'output_aln_target_path': output_aln_target_path}
105 }
107 # Properties specific for BB
108 self.properties = properties
109 self.gapopen = properties.get('gapopen', 12.0)
110 self.gapextend = properties.get('gapextend', 2.0)
111 self.datafile = properties.get('datafile', "EPAM250")
112 self.binary_path = properties.get('binary_path', "water")
114 # Check the properties
115 self.check_properties(properties)
116 self.check_arguments()
118 def check_data_params(self, out_log, out_err):
119 """ Checks input/output paths correctness """
121 # Check input(s)
122 self.io_dict["in"]["input_pdb_orig_path"] = check_input_path(self.io_dict["in"]["input_pdb_orig_path"], "input_pdb_orig_path", False, out_log, self.__class__.__name__)
123 self.io_dict["in"]["input_pdb_target_path"] = check_input_path(self.io_dict["in"]["input_pdb_target_path"], "input_pdb_target_path", False, out_log, self.__class__.__name__)
125 # Check output(s)
126 self.io_dict["out"]["output_aln_orig_path"] = check_output_path(self.io_dict["out"]["output_aln_orig_path"], "output_aln_orig_path", False, out_log, self.__class__.__name__)
127 self.io_dict["out"]["output_aln_target_path"] = check_output_path(self.io_dict["out"]["output_aln_target_path"], "output_aln_target_path", False, out_log, self.__class__.__name__)
129 def extract_sequence(self, pdb):
130 '''
131 Parses a PDB file to retrieve the sequence of a certain chain.
132 '''
133 seq = ''
134 resids = []
135 with open(pdb) as file:
136 for line in file:
137 line = line.rstrip('\n')
138 record = line[:6].replace(" ", "")
139 if record != 'ATOM':
140 continue
141 atomname = line[12:16]
142 atomname = atomname.replace(" ", "")
143 if atomname != 'CA':
144 continue
145 alternate = line[16]
146 if alternate != ' ':
147 if alternate != 'A':
148 continue # if alt loc, only A-labeled residues
149 resname = line[17:20].replace(" ", "")
150 if resname not in self.AA_TRANSLATOR:
151 continue
152 resnum = line[22:26].replace(" ", "")
153 icode = line[26:27]
154 seq += self.AA_TRANSLATOR[resname]
155 resids += [(' ', int(resnum), icode)] # BioPython's standard PDB atom id (' ',resid,insertioncode) (e.g. (' ',40,'B'))
157 return seq, resids
159 def retrieve_alignment(self, waterFile, resids1, resids2):
160 '''
161 Gets the FASTA sequence of two PDB structures and a list of their residues in Biopython residue format.
162 Opens a file containing a local sequence alignment (Water program of EMBOSS package) between those two PDB structures.
163 Returns the sequence identity and the pairs of residues of the alignment.
164 '''
165 # Parsing Water alignment file
166 is_alignment = 0
167 is_qseq = 0
168 is_sseq = 0
169 qseq = ""
170 sseq = ""
171 ident = ""
172 next_seq = 0
173 qstart = ""
174 sstart = ""
175 with open(waterFile) as file:
176 for line in file:
177 line = line.rstrip("\n")
178 if is_alignment == 0:
179 if not line.startswith(">>#1"):
180 continue
181 else:
182 is_alignment = 1
183 else:
184 if line.startswith("; sw_ident:"):
185 ident = line[12:]
186 elif line.startswith("; al_start:"):
187 if next_seq == 0:
188 qstart = line[12:]
189 else:
190 sstart = line[12:]
191 elif line.startswith("; al_display_start:"):
192 if next_seq == 0:
193 is_qseq = 1
194 else:
195 is_sseq = 1
196 else:
197 if is_qseq == 1:
198 if not line.startswith(">"):
199 qseq = qseq + line
200 else:
201 is_qseq = 0
202 next_seq = 1
203 if is_sseq == 1:
204 if line != "":
205 sseq = sseq + line
206 else:
207 is_sseq = 0
209 # Get the Sequence Identity of the alignment
210 seq_id = float(ident)
212 # Get the residues of the alignment.
213 # -2 is applied because in the first iteration of next loop all values will always be increased by 1 (local alignment)
214 # and we also need to convert the number into a python index to retrieve the proper token in the list
215 resid_pairs = []
216 idx1 = int(qstart)-2
217 idx2 = int(sstart)-2
218 for i in range(len(qseq)): # it could be also len(sseq)
219 if qseq[i] != '-':
220 idx1 += 1
221 if sseq[i] != '-':
222 idx2 += 1
223 if qseq[i] == '-' or sseq[i] == '-':
224 continue
225 resid_pairs += [(resids1[idx1], resids2[idx2])]
226 # Check contents of the residues of the alignment
227 if len(resid_pairs) == 0: # Alignment file was empty or there was no possible matching residues between the PDBs
228 fu.log('Alignment file was empty or there was no possible matching residues between the PDBs' % self.stage_io_dict["unique_dir"], self.out_log)
229 return False, False
230 else:
231 return seq_id, resid_pairs
233 @launchlogger
234 def launch(self):
235 """Launches the execution of the GOdMDPrep module."""
237 # check input/output paths and parameters
238 self.check_data_params(self.out_log, self.err_log)
240 # Setup Biobb
241 if self.check_restart():
242 return 0
243 self.stage_files()
245 # Starting the work...
246 # Parsing the input PDB files
247 pdb1 = self.io_dict["in"]["input_pdb_orig_path"]
248 pdb2 = self.io_dict["in"]["input_pdb_target_path"]
250 # Generate sequence of first PDB
251 seq1, resids1 = self.extract_sequence(pdb1)
253 # Generate sequence of second PDB
254 seq2, resids2 = self.extract_sequence(pdb2)
256 if len(seq1) < 50:
257 fu.log('WARNING: Short sequence (ORIGIN)' % self.stage_io_dict["unique_dir"], self.out_log)
258 if len(seq2) < 50:
259 fu.log('WARNING: Short sequence (TARGET)' % self.stage_io_dict["unique_dir"], self.out_log)
261 # Produce FASTA files
262 name1 = "fasta1.fa"
263 name2 = "fasta2.fa"
264 fasta1Filename = str(Path(self.stage_io_dict["unique_dir"]).joinpath(name1))
265 fasta1File = open(fasta1Filename, "w")
266 fasta1File.write(">"+pdb1+"\n"+seq1)
267 fasta1File.close()
268 fasta2Filename = str(Path(self.stage_io_dict["unique_dir"]).joinpath(name2))
269 fasta2File = open(fasta2Filename, "w")
270 fasta2File.write(">"+pdb2+"\n"+seq2)
271 fasta2File.close()
273 waterFilename = str(Path(self.stage_io_dict["unique_dir"]).joinpath("water_align.out"))
275 # water -auto -outfile=water_align.out -asequence=1ake.chains.nolig.pdb.fa
276 # -bsequence=4ake.chains.pdb.fa -gapopen=12 -gapextend=2
277 # -datafile=EPAM250 -aformat=markx10
279 # Command line
280 self.cmd = [self.binary_path,
281 '-auto',
282 '-outfile', waterFilename,
283 '-asequence', fasta1Filename,
284 '-bsequence', fasta2Filename,
285 '-gapopen', str(self.gapopen),
286 '-gapextend', str(self.gapextend),
287 '-datafile', self.datafile,
288 '-aformat', "markx10"
289 ]
291 # Run Biobb block
292 self.run_biobb()
294 # Starting post-alignment process: generating .aln files
296 # Retrieve sequence identity of the pair, based on local sequence alignment
297 # previously computed, and generate the pairs of residues from both structres
298 # to drive the superimposition
299 seq_id, resid_pairs = self.retrieve_alignment(waterFilename, resids1, resids2)
301 aln1Filename = self.io_dict["out"]["output_aln_orig_path"]
302 aln1File = open(aln1Filename, 'w')
303 for pair in resid_pairs:
304 aln1File.write("%s%s\n" % (pair[0][1], pair[0][2].replace(" ", "")))
305 aln1File.close()
307 aln2Filename = self.io_dict["out"]["output_aln_target_path"]
308 aln2File = open(aln2Filename, 'w')
309 for pair in resid_pairs:
310 aln2File.write("%s%s\n" % (pair[1][1], pair[1][2].replace(" ", "")))
311 aln2File.close()
313 # Copy files to host
314 self.copy_to_host()
316 # remove temporary folder(s)
317 # self.tmp_files.extend([
318 # self.stage_io_dict.get("unique_dir", "")
319 # ])
320 self.remove_tmp_files()
322 self.check_arguments(output_files_created=True, raise_exception=False)
324 return self.return_code
327def godmd_prep(input_pdb_orig_path: str, input_pdb_target_path: str,
328 output_aln_orig_path: str, output_aln_target_path: str,
329 properties: Optional[dict] = None, **kwargs) -> int:
330 """Create :class:`GOdMDPrep <godmd.godmd_prep.GOdMDPrep>`godmd.godmd_prep.GOdMDPrep class and
331 execute :meth:`launch() <godmd.godmd_prep.GOdMDPrep.launch>` method"""
333 return GOdMDPrep(input_pdb_orig_path=input_pdb_orig_path,
334 input_pdb_target_path=input_pdb_target_path,
335 output_aln_orig_path=output_aln_orig_path,
336 output_aln_target_path=output_aln_target_path,
337 properties=properties).launch()
339 godmd_prep.__doc__ = GOdMDPrep.__doc__
342def main():
343 parser = argparse.ArgumentParser(description='Prepares input files for the GOdMD tool.', formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999))
344 parser.add_argument('--config', required=False, help='Configuration file')
346 # Specific args
347 required_args = parser.add_argument_group('required arguments')
348 required_args.add_argument('--input_pdb_orig_path', required=True, help='Input PDB file to be used as origin in the conformational transition. Accepted formats: pdb.')
349 required_args.add_argument('--input_pdb_target_path', required=True, help='Input PDB file to be used as target in the conformational transition. Accepted formats: pdb.')
350 required_args.add_argument('--output_aln_orig_path', required=True, help='Output GOdMD alignment file corresponding to the origin structure of the conformational transition. Accepted formats: aln, txt.')
351 required_args.add_argument('--output_aln_target_path', required=True, help='Output GOdMD alignment file corresponding to the target structure of the conformational transition. Accepted formats: aln, txt.')
353 args = parser.parse_args()
354 # config = args.config if args.config else None
355 args.config = args.config or "{}"
356 # properties = settings.ConfReader(config=config).get_prop_dic()
357 properties = settings.ConfReader(config=args.config).get_prop_dic()
359 # Specific call
360 godmd_prep(input_pdb_orig_path=args.input_pdb_orig_path,
361 input_pdb_target_path=args.input_pdb_target_path,
362 output_aln_orig_path=args.output_aln_orig_path,
363 output_aln_target_path=args.output_aln_target_path,
364 properties=properties)
367if __name__ == '__main__':
368 main()