Coverage for biobb_godmd/godmd/godmd

1#!/usr/bin/env python3

3"""Module containing the GOdMDPrep class and the command line interface."""

4import argparse

5from typing import Optional

6from pathlib import Path

7from biobb_common.generic.biobb_object import BiobbObject

8from biobb_common.configuration import settings

9from biobb_common.tools import file_utils as fu

10from biobb_common.tools.file_utils import launchlogger

11from biobb_godmd.godmd.common import check_input_path, check_output_path

14class GOdMDPrep(BiobbObject):

15 """

16 | biobb_godmd GOdMDPrep

17 | Helper bb to prepare inputs for the `GOdMD tool <http://mmb.irbbarcelona.org/GOdMD/>`_ module.

18 | Prepares input files for the GOdMD tool.

20 Args:

21 input_pdb_orig_path (str): Input PDB file to be used as origin in the conformational transition. File type: input. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/master/biobb_godmd/test/data/godmd/1ake_A.pdb>`_. Accepted formats: pdb (edam:format_1476).

22 input_pdb_target_path (str): Input PDB file to be used as target in the conformational transition. File type: input. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/master/biobb_godmd/test/data/godmd/4ake_A.pdb>`_. Accepted formats: pdb (edam:format_1476).

23 output_aln_orig_path (str): Output GOdMD alignment file corresponding to the origin structure of the conformational transition. File type: output. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/master/biobb_godmd/test/data/godmd/1ake_A.aln>`_. Accepted formats: aln (edam:format_2330), txt (edam:format_2330).

24 output_aln_target_path (str): Output GOdMD alignment file corresponding to the target structure of the conformational transition. File type: output. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/master/biobb_godmd/test/data/godmd/4ake_A.aln>`_. Accepted formats: aln (edam:format_2330), txt (edam:format_2330).

25 properties (dict - Python dictionary object containing the tool parameters, not input/output files):

26 * **gapopen** (*float*) - (12.0) Standard gap penalty: score taken away when a gap is created.

27 * **gapextend** (*float*) - (2.0) Penalty added to the standard gap penalty for each base or residue in the gap.

28 * **datafile** (*str*) - ("EPAM250") Scoring matrix file used when comparing sequences.

29 * **binary_path** (*str*) - ("water") Binary path.

30 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.

31 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.

32 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.

34 Examples:

35 This is a use example of how to use the building block from Python::

37 from biobb_godmd.godmd.godmd_prep import godmd_prep

38 prop = {

39 'gapopen': 10.0,

40 'gapextend': 2.0

41 }

42 godmd_prep( input_pdb_orig_path='/path/to/input_orig.pdb',

43 input_pdb_target_path='/path/to/input_target.pdb',

44 output_aln_orig_path='/path/to/orig.aln',

45 output_aln_target_path='/path/to/target.aln',

46 properties=prop)

48 Info:

49 * wrapped_software:

50 * name: In house

51 * license: Apache-2.0

52 * ontology:

53 * name: EDAM

54 * schema: http://edamontology.org/EDAM.owl

56 """

58 def __init__(self, input_pdb_orig_path: str, input_pdb_target_path: str,

59 output_aln_orig_path: str, output_aln_target_path: str,

60 properties: Optional[dict] = None, **kwargs) -> None:

62 properties = properties or {}

64 # Call parent class constructor

65 super().__init__(properties)

66 self.locals_var_dict = locals().copy()

68 self.AA_TRANSLATOR = {'GLY': 'G',

69 'ALA': 'A',

70 'VAL': 'V',

71 'LEU': 'L',

72 'ILE': 'I',

73 'MET': 'M',

74 'PHE': 'F',

75 'TRP': 'W',

76 'PRO': 'P',

77 'SER': 'S',

78 'THR': 'T',

79 'CYS': 'C',

80 'CYX': 'C',

81 'CYM': 'C',

82 'TYR': 'Y',

83 'TYM': 'Y',

84 'ASN': 'N',

85 'GLN': 'Q',

86 'ASP': 'D',

87 'ASH': 'D',

88 'GLU': 'E',

89 'GLH': 'E',

90 'LYS': 'K',

91 'LYN': 'K',

92 'ARG': 'R',

93 'ARN': 'R',

94 'HIS': 'H',

95 'HIE': 'H',

96 'HID': 'H',

97 'HIP': 'H'}

99 # Input/Output files

100 self.io_dict = {

101 'in': {'input_pdb_orig_path': input_pdb_orig_path,

102 'input_pdb_target_path': input_pdb_target_path},

103 'out': {'output_aln_orig_path': output_aln_orig_path,

104 'output_aln_target_path': output_aln_target_path}

105 }

106

107 # Properties specific for BB

108 self.properties = properties

109 self.gapopen = properties.get('gapopen', 12.0)

110 self.gapextend = properties.get('gapextend', 2.0)

111 self.datafile = properties.get('datafile', "EPAM250")

112 self.binary_path = properties.get('binary_path', "water")

113

114 # Check the properties

115 self.check_properties(properties)

116 self.check_arguments()

117

118 def check_data_params(self, out_log, out_err):

119 """ Checks input/output paths correctness """

120

121 # Check input(s)

122 self.io_dict["in"]["input_pdb_orig_path"] = check_input_path(self.io_dict["in"]["input_pdb_orig_path"], "input_pdb_orig_path", False, out_log, self.__class__.__name__)

123 self.io_dict["in"]["input_pdb_target_path"] = check_input_path(self.io_dict["in"]["input_pdb_target_path"], "input_pdb_target_path", False, out_log, self.__class__.__name__)

124

125 # Check output(s)

126 self.io_dict["out"]["output_aln_orig_path"] = check_output_path(self.io_dict["out"]["output_aln_orig_path"], "output_aln_orig_path", False, out_log, self.__class__.__name__)

127 self.io_dict["out"]["output_aln_target_path"] = check_output_path(self.io_dict["out"]["output_aln_target_path"], "output_aln_target_path", False, out_log, self.__class__.__name__)

128

129 def extract_sequence(self, pdb):

130 '''

131 Parses a PDB file to retrieve the sequence of a certain chain.

132 '''

133 seq = ''

134 resids = []

135 with open(pdb) as file:

136 for line in file:

137 line = line.rstrip('\n')

138 record = line[:6].replace(" ", "")

139 if record != 'ATOM':

140 continue

141 atomname = line[12:16]

142 atomname = atomname.replace(" ", "")

143 if atomname != 'CA':

144 continue

145 alternate = line[16]

146 if alternate != ' ':

147 if alternate != 'A':

148 continue # if alt loc, only A-labeled residues

149 resname = line[17:20].replace(" ", "")

150 if resname not in self.AA_TRANSLATOR:

151 continue

152 resnum = line[22:26].replace(" ", "")

153 icode = line[26:27]

154 seq += self.AA_TRANSLATOR[resname]

155 resids += [(' ', int(resnum), icode)] # BioPython's standard PDB atom id (' ',resid,insertioncode) (e.g. (' ',40,'B'))

156

157 return seq, resids

158

159 def retrieve_alignment(self, waterFile, resids1, resids2):

160 '''

161 Gets the FASTA sequence of two PDB structures and a list of their residues in Biopython residue format.

162 Opens a file containing a local sequence alignment (Water program of EMBOSS package) between those two PDB structures.

163 Returns the sequence identity and the pairs of residues of the alignment.

164 '''

165 # Parsing Water alignment file

166 is_alignment = 0

167 is_qseq = 0

168 is_sseq = 0

169 qseq = ""

170 sseq = ""

171 ident = ""

172 next_seq = 0

173 qstart = ""

174 sstart = ""

175 with open(waterFile) as file:

176 for line in file:

177 line = line.rstrip("\n")

178 if is_alignment == 0:

179 if not line.startswith(">>#1"):

180 continue

181 else:

182 is_alignment = 1

183 else:

184 if line.startswith("; sw_ident:"):

185 ident = line[12:]

186 elif line.startswith("; al_start:"):

187 if next_seq == 0:

188 qstart = line[12:]

189 else:

190 sstart = line[12:]

191 elif line.startswith("; al_display_start:"):

192 if next_seq == 0:

193 is_qseq = 1

194 else:

195 is_sseq = 1

196 else:

197 if is_qseq == 1:

198 if not line.startswith(">"):

199 qseq = qseq + line

200 else:

201 is_qseq = 0

202 next_seq = 1

203 if is_sseq == 1:

204 if line != "":

205 sseq = sseq + line

206 else:

207 is_sseq = 0

208

209 # Get the Sequence Identity of the alignment

210 seq_id = float(ident)

211

212 # Get the residues of the alignment.

213 # -2 is applied because in the first iteration of next loop all values will always be increased by 1 (local alignment)

214 # and we also need to convert the number into a python index to retrieve the proper token in the list

215 resid_pairs = []

216 idx1 = int(qstart)-2

217 idx2 = int(sstart)-2

218 for i in range(len(qseq)): # it could be also len(sseq)

219 if qseq[i] != '-':

220 idx1 += 1

221 if sseq[i] != '-':

222 idx2 += 1

223 if qseq[i] == '-' or sseq[i] == '-':

224 continue

225 resid_pairs += [(resids1[idx1], resids2[idx2])]

226 # Check contents of the residues of the alignment

227 if len(resid_pairs) == 0: # Alignment file was empty or there was no possible matching residues between the PDBs

228 fu.log('Alignment file was empty or there was no possible matching residues between the PDBs' % self.stage_io_dict["unique_dir"], self.out_log)

229 return False, False

230 else:

231 return seq_id, resid_pairs

232

233 @launchlogger

234 def launch(self):

235 """Launches the execution of the GOdMDPrep module."""

236

237 # check input/output paths and parameters

238 self.check_data_params(self.out_log, self.err_log)

239

240 # Setup Biobb

241 if self.check_restart():

242 return 0

243 self.stage_files()

244

245 # Starting the work...

246 # Parsing the input PDB files

247 pdb1 = self.io_dict["in"]["input_pdb_orig_path"]

248 pdb2 = self.io_dict["in"]["input_pdb_target_path"]

249

250 # Generate sequence of first PDB

251 seq1, resids1 = self.extract_sequence(pdb1)

252

253 # Generate sequence of second PDB

254 seq2, resids2 = self.extract_sequence(pdb2)

255

256 if len(seq1) < 50:

257 fu.log('WARNING: Short sequence (ORIGIN)' % self.stage_io_dict["unique_dir"], self.out_log)

258 if len(seq2) < 50:

259 fu.log('WARNING: Short sequence (TARGET)' % self.stage_io_dict["unique_dir"], self.out_log)

260

261 # Produce FASTA files

262 name1 = "fasta1.fa"

263 name2 = "fasta2.fa"

264 fasta1Filename = str(Path(self.stage_io_dict["unique_dir"]).joinpath(name1))

265 fasta1File = open(fasta1Filename, "w")

266 fasta1File.write(">"+pdb1+"\n"+seq1)

267 fasta1File.close()

268 fasta2Filename = str(Path(self.stage_io_dict["unique_dir"]).joinpath(name2))

269 fasta2File = open(fasta2Filename, "w")

270 fasta2File.write(">"+pdb2+"\n"+seq2)

271 fasta2File.close()

272

273 waterFilename = str(Path(self.stage_io_dict["unique_dir"]).joinpath("water_align.out"))

274

275 # water -auto -outfile=water_align.out -asequence=1ake.chains.nolig.pdb.fa

276 # -bsequence=4ake.chains.pdb.fa -gapopen=12 -gapextend=2

277 # -datafile=EPAM250 -aformat=markx10

278

279 # Command line

280 self.cmd = [self.binary_path,

281 '-auto',

282 '-outfile', waterFilename,

283 '-asequence', fasta1Filename,

284 '-bsequence', fasta2Filename,

285 '-gapopen', str(self.gapopen),

286 '-gapextend', str(self.gapextend),

287 '-datafile', self.datafile,

288 '-aformat', "markx10"

289 ]

290

291 # Run Biobb block

292 self.run_biobb()

293

294 # Starting post-alignment process: generating .aln files

295

296 # Retrieve sequence identity of the pair, based on local sequence alignment

297 # previously computed, and generate the pairs of residues from both structres

298 # to drive the superimposition

299 seq_id, resid_pairs = self.retrieve_alignment(waterFilename, resids1, resids2)

300

301 aln1Filename = self.io_dict["out"]["output_aln_orig_path"]

302 aln1File = open(aln1Filename, 'w')

303 for pair in resid_pairs:

304 aln1File.write("%s%s\n" % (pair[0][1], pair[0][2].replace(" ", "")))

305 aln1File.close()

306

307 aln2Filename = self.io_dict["out"]["output_aln_target_path"]

308 aln2File = open(aln2Filename, 'w')

309 for pair in resid_pairs:

310 aln2File.write("%s%s\n" % (pair[1][1], pair[1][2].replace(" ", "")))

311 aln2File.close()

312

313 # Copy files to host

314 self.copy_to_host()

315

316 # remove temporary folder(s)

317 # self.tmp_files.extend([

318 # self.stage_io_dict.get("unique_dir", "")

319 # ])

320 self.remove_tmp_files()

321

322 self.check_arguments(output_files_created=True, raise_exception=False)

323

324 return self.return_code

325

326

327def godmd_prep(input_pdb_orig_path: str, input_pdb_target_path: str,

328 output_aln_orig_path: str, output_aln_target_path: str,

329 properties: Optional[dict] = None, **kwargs) -> int:

330 """Create :class:`GOdMDPrep <godmd.godmd_prep.GOdMDPrep>`godmd.godmd_prep.GOdMDPrep class and

331 execute :meth:`launch() <godmd.godmd_prep.GOdMDPrep.launch>` method"""

332

333 return GOdMDPrep(input_pdb_orig_path=input_pdb_orig_path,

334 input_pdb_target_path=input_pdb_target_path,

335 output_aln_orig_path=output_aln_orig_path,

336 output_aln_target_path=output_aln_target_path,

337 properties=properties).launch()

338

339 godmd_prep.__doc__ = GOdMDPrep.__doc__

340

341

342def main():

343 parser = argparse.ArgumentParser(description='Prepares input files for the GOdMD tool.', formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999))

344 parser.add_argument('--config', required=False, help='Configuration file')

345

346 # Specific args

347 required_args = parser.add_argument_group('required arguments')

348 required_args.add_argument('--input_pdb_orig_path', required=True, help='Input PDB file to be used as origin in the conformational transition. Accepted formats: pdb.')

349 required_args.add_argument('--input_pdb_target_path', required=True, help='Input PDB file to be used as target in the conformational transition. Accepted formats: pdb.')

350 required_args.add_argument('--output_aln_orig_path', required=True, help='Output GOdMD alignment file corresponding to the origin structure of the conformational transition. Accepted formats: aln, txt.')

351 required_args.add_argument('--output_aln_target_path', required=True, help='Output GOdMD alignment file corresponding to the target structure of the conformational transition. Accepted formats: aln, txt.')

352

353 args = parser.parse_args()

354 # config = args.config if args.config else None

355 args.config = args.config or "{}"

356 # properties = settings.ConfReader(config=config).get_prop_dic()

357 properties = settings.ConfReader(config=args.config).get_prop_dic()

358

359 # Specific call

360 godmd_prep(input_pdb_orig_path=args.input_pdb_orig_path,

361 input_pdb_target_path=args.input_pdb_target_path,

362 output_aln_orig_path=args.output_aln_orig_path,

363 output_aln_target_path=args.output_aln_target_path,

364 properties=properties)

365

366

367if __name__ == '__main__':

368 main()

Coverage for biobb_godmd/godmd/godmd_prep.py: 87%

163 statements