Coverage for biobb_chemistry/babelm/babel_add_hydrogens.py: 84%

87 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-03-12 09:28 +0000

1#!/usr/bin/env python3 

2 

3"""Module containing the BabelAddHydrogens class and the command line interface.""" 

4 

5import argparse 

6from typing import Optional 

7 

8from biobb_common.configuration import settings 

9from biobb_common.generic.biobb_object import BiobbObject 

10from biobb_common.tools.file_utils import launchlogger 

11 

12from biobb_chemistry.babelm.common import ( 

13 _from_string_to_list, 

14 check_input_path, 

15 check_output_path, 

16 get_coordinates, 

17 get_input_format, 

18 get_output_format, 

19 get_ph, 

20) 

21 

22 

class BabelAddHydrogens(BiobbObject):
    """
    | biobb_chemistry BabelAddHydrogens
    | This class is a wrapper of the Open Babel tool.
    | Adds hydrogens to a given structure or trajectory. Open Babel is a chemical toolbox designed to speak the many languages of chemical data. It's an open, collaborative project allowing anyone to search, convert, analyze, or store data from molecular modeling, chemistry, solid-state materials, biochemistry, or related areas. `Visit the official page <http://openbabel.org/wiki/Main_Page>`_.

    Args:
        input_path (str): Path to the input file. File type: input. `Sample file <https://github.com/bioexcel/biobb_chemistry/raw/master/biobb_chemistry/test/data/babel/babel.no.H.pdb>`_. Accepted formats: dat (edam:format_1637), ent (edam:format_1476), fa (edam:format_1929), fasta (edam:format_1929), gro (edam:format_2033), inp (edam:format_3878), log (edam:format_2030), mcif (edam:format_1477), mdl (edam:format_3815), mmcif (edam:format_1477), mol (edam:format_3815), mol2 (edam:format_3816), pdb (edam:format_1476), pdbqt (edam:format_1476), png (edam:format_3603), sdf (edam:format_3814), smi (edam:format_1196), smiles (edam:format_1196), txt (edam:format_2033), xml (edam:format_2332), xtc (edam:format_3875).
        output_path (str): Path to the output file. File type: output. `Sample file <https://github.com/bioexcel/biobb_chemistry/raw/master/biobb_chemistry/test/reference/babel/ref_babel.hydrogens.pdb>`_. Accepted formats: ent (edam:format_1476), fa (edam:format_1929), fasta (edam:format_1929), gro (edam:format_2033), inp (edam:format_3878), mcif (edam:format_1477), mdl (edam:format_3815), mmcif (edam:format_1477), mol (edam:format_3815), mol2 (edam:format_3816), pdb (edam:format_1476), pdbqt (edam:format_1476), png (edam:format_3603), sdf (edam:format_3814), smi (edam:format_1196), smiles (edam:format_1196), txt (edam:format_2033).
        properties (dic - Python dictionary object containing the tool parameters, not input/output files):
            * **input_format** (*str*) - (None) Format of input file. If not provided, input_path extension will be taken. Values: dat (Information represented in a data record), ent (Protein Data Bank format), fa (FASTA sequence format), fasta (FASTA sequence format), gro (GROMACS structure), inp (AMBER trajectory format), log (Events file), mcif (Entry format of PDB database in mmCIF format), mdl (file format for holding information about the atoms; bonds; connectivity and coordinates of a molecule), mmcif (Entry format of PDB database in mmCIF format), mol (file format for holding information about the atoms; bonds; connectivity and coordinates of a molecule), mol2 (Complete and portable representation of a SYBYL molecule), pdb (Protein Data Bank format), pdbqt (Protein Data Bank format with charges), png (File format for image compression), sdf (One of a family of chemical-data file formats developed by MDL Information Systems), smi (Chemical structure specified in Simplified Molecular Input Line Entry System line notation.), smiles (Chemical structure specified in Simplified Molecular Input Line Entry System line notation.), txt (Textual format), xml (eXtensible Markup Language), xtc (Portable binary format for trajectories produced by GROMACS package).
            * **output_format** (*str*) - (None) Format of output file. If not provided, output_path extension will be taken. Values: ent (Protein Data Bank format), fa (FASTA sequence format), fasta (FASTA sequence format), gro (GROMACS structure), inp (AMBER trajectory format), mcif (Entry format of PDB database in mmCIF format), mdl (file format for holding information about the atoms; bonds; connectivity and coordinates of a molecule), mmcif (Entry format of PDB database in mmCIF format), mol (file format for holding information about the atoms; bonds; connectivity and coordinates of a molecule), mol2 (Complete and portable representation of a SYBYL molecule), pdb (Protein Data Bank format), pdbqt (Protein Data Bank format with charges), png (File format for image compression), sdf (One of a family of chemical-data file formats developed by MDL Information Systems), smi (Chemical structure specified in Simplified Molecular Input Line Entry System line notation.), smiles (Chemical structure specified in Simplified Molecular Input Line Entry System line notation.), txt (Textual format), xtc (Portable binary format for trajectories produced by GROMACS package).
            * **fs_input** (*list*) - (None) Format-specific input options. Values: b (disable automatic bonding), d (input file is in dlg -AutoDock docking log- format).
            * **fs_output** (*list*) - (["h"]) Format-specific output options. Values: b (enable automatic bonding), r (output as a rigid molecule), c (combine separate molecular pieces of input into a single rigid molecule), s (output as a flexible residue), p (preserve atom indices from input file), h (preserve hydrogens), n (preserve atom names).
            * **coordinates** (*int*) - (None) Type of coordinates: 2D or 3D. Values: 2 (2D coordinates), 3 (3D coordinates).
            * **effort** (*str*) - ("medium") Computational effort wanted to dedicate for the conformer generation coordinates calculations, only for 3D coordinates. Values: fastest (only generate coordinates, no force field or conformer search), fast (perform quick forcefield optimization), medium (forcefield optimization + fast conformer search), better (more optimization + fast conformer search), best (more optimization + significant conformer search).
            * **ph** (*float*) - (7.4) [0~14|0.1] Add hydrogens appropriate for pH.
            * **binary_path** (*str*) - ("obabel") Path to the obabel executable binary.
            * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
            * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
            * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.
            * **container_path** (*str*) - (None) Container path definition.
            * **container_image** (*str*) - ('informaticsmatters/obabel:latest') Container image definition.
            * **container_volume_path** (*str*) - ('/tmp') Container volume path definition.
            * **container_working_dir** (*str*) - (None) Container working directory definition.
            * **container_user_id** (*str*) - (None) Container user_id definition.
            * **container_shell_path** (*str*) - ('/bin/bash') Path to default shell inside the container.

    Examples:
        This is a use example of how to use the building block from Python::

            from biobb_chemistry.babelm.babel_add_hydrogens import babel_add_hydrogens
            prop = {
                'input_format': 'pdb',
                'output_format': 'pdb',
                'coordinates': 3,
                'ph': 7.4
            }
            babel_add_hydrogens(input_path='/path/to/myStructure.pdb',
                                output_path='/path/to/newStructure.pdb',
                                properties=prop)

    Info:
        * wrapped_software:
            * name: Open Babel
            * version: 2.4.1
            * license: GNU
        * ontology:
            * name: EDAM
            * schema: http://edamontology.org/EDAM.owl

    """

    def __init__(self, input_path, output_path, properties=None, **kwargs) -> None:
        properties = properties or {}

        # Initialise the BiobbObject base with the property dictionary.
        super().__init__(properties)
        # Snapshot of the constructor's local names; taken before any other
        # local variable is bound so its contents stay limited to the arguments.
        self.locals_var_dict = locals().copy()

        # Files handled by this block, grouped by direction.
        self.io_dict = {
            "in": {"input_path": input_path},
            "out": {"output_path": output_path},
        }

        # Settings specific to this building block, read with their fallbacks.
        # NOTE(review): the class docstring documents 7.4 as the default pH,
        # while the fallback here is "" -- confirm which one is intended.
        read = properties.get
        self.input_format = read("input_format", "")
        self.output_format = read("output_format", "")
        self.fs_input = _from_string_to_list(read("fs_input", None))
        self.fs_output = _from_string_to_list(read("fs_output", ["h"]))
        self.coordinates = read("coordinates", "")
        self.effort = read("effort", "medium")
        self.ph = read("ph", "")
        self.binary_path = read("binary_path", "obabel")
        self.properties = properties

        # Validate the received properties and arguments.
        self.check_properties(properties)
        self.check_arguments()

    def check_data_params(self, out_log, err_log):
        """Run the input and output paths through their checkers and store the results.

        Args:
            out_log: Log handle forwarded to the path checkers.
            err_log: Not used by this method.
        """
        owner = self.__class__.__name__
        sources = self.io_dict["in"]
        sources["input_path"] = check_input_path(sources["input_path"], out_log, owner)
        targets = self.io_dict["out"]
        targets["output_path"] = check_output_path(
            targets["output_path"], out_log, owner
        )

    def create_cmd(self, container_io_dict, out_log, err_log):
        """Build the obabel invocation from the configured properties.

        Args:
            container_io_dict: Dictionary holding the paths to use, read at
                ``["in"]["input_path"]`` and ``["out"]["output_path"]``.
            out_log: Log handle forwarded to the format/coordinate/pH helpers.
            err_log: Not used by this method.

        Returns:
            list: The command pieces, in order: binary, input format and path,
            output format and path, optional coordinate generation flag, the
            pH (or plain hydrogens) flag, format-specific input and output
            options and, for 3D coordinates only, the effort flag.
        """
        # The executable always comes first.
        cmd = [self.binary_path]

        # Input: format flag followed by the file itself.
        in_file = container_io_dict["in"]["input_path"]
        in_fmt = get_input_format(self.input_format, in_file, out_log)
        cmd.extend(["-i" + in_fmt, in_file])

        # Output: format flag followed by the destination.
        out_file = container_io_dict["out"]["output_path"]
        out_fmt = get_output_format(self.output_format, out_file, out_log)
        cmd.extend(["-o" + out_fmt, "-O" + out_file])

        # Coordinate generation is requested only when the helper yields a value.
        dims = get_coordinates(self.coordinates, out_log)
        if dims:
            cmd.append("--gen" + dims + "d")

        # Either add hydrogens for the given pH or just add hydrogens.
        ph_value = get_ph(self.ph, out_log)
        cmd.append(("-p " + ph_value) if ph_value else "-h")

        # Format-specific options for reading (-a) and writing (-x).
        if self.fs_input is not None:
            cmd.extend("-a" + flag for flag in self.fs_input)
        if self.fs_output is not None:
            cmd.extend("-x" + flag for flag in self.fs_output)

        # The effort level is only passed along with 3D coordinates.
        if dims == "3":
            cmd.append("--" + self.effort)

        return cmd

    @launchlogger
    def launch(self) -> int:
        """Execute the :class:`BabelAddHydrogens <babelm.babel_add_hydrogens.BabelAddHydrogens>` babelm.babel_add_hydrogens.BabelAddHydrogens object.

        Returns:
            int: ``0`` when ``check_restart()`` reports that nothing has to be
            done, otherwise ``self.return_code`` after the run.
        """
        # Validate the input/output paths first.
        self.check_data_params(self.out_log, self.err_log)

        # Skip the whole run when a restart is detected; otherwise stage the files.
        if self.check_restart():
            return 0
        self.stage_files()

        # Build the command from the staged paths and run it.
        self.cmd = self.create_cmd(self.stage_io_dict, self.out_log, self.err_log)
        self.run_biobb()

        # Bring results back and clean up temporary files.
        self.copy_to_host()
        self.remove_tmp_files()

        # Final argument check; asked not to raise on failure.
        self.check_arguments(output_files_created=True, raise_exception=False)

        return self.return_code

203 

204 

def babel_add_hydrogens(
    input_path: str, output_path: str, properties: Optional[dict] = None, **kwargs
) -> int:
    """Execute the :class:`BabelAddHydrogens <babelm.babel_add_hydrogens.BabelAddHydrogens>` class and
    execute the :meth:`launch() <babelm.babel_add_hydrogens.BabelAddHydrogens.launch>` method.

    Args:
        input_path: Path to the input file, forwarded to the class constructor.
        output_path: Path to the output file, forwarded to the class constructor.
        properties: Optional dictionary of tool properties.
        **kwargs: Extra keyword arguments forwarded to the class constructor.

    Returns:
        int: The value returned by ``BabelAddHydrogens.launch()``.
    """

    return BabelAddHydrogens(
        input_path=input_path, output_path=output_path, properties=properties, **kwargs
    ).launch()


# This assignment has to live at module level. It previously sat inside the
# function body after the ``return`` statement, where it could never execute,
# so the function never exposed the class documentation.
babel_add_hydrogens.__doc__ = BabelAddHydrogens.__doc__

216 

217 

def main():
    """Command line execution of this building block. Please check the command line documentation."""

    def _wide_raw_formatter(prog):
        # Raw-text help with a very large width so help strings are not wrapped.
        return argparse.RawTextHelpFormatter(prog, width=99999)

    cli = argparse.ArgumentParser(
        description="Adds hydrogen atoms to small molecules.",
        formatter_class=_wide_raw_formatter,
    )
    cli.add_argument("--config", required=False, help="Configuration file")

    # Arguments specific to this building block.
    io_group = cli.add_argument_group("required arguments")
    io_group.add_argument(
        "--input_path",
        required=True,
        help="Path to the input file. Accepted formats: dat, ent, fa, fasta, gro, inp, log, mcif, mdl, mmcif, mol, mol2, pdb, pdbqt, png, sdf, smi, smiles, txt, xml, xtc.",
    )
    io_group.add_argument(
        "--output_path",
        required=True,
        help="Path to the output file. Accepted formats: ent, fa, fasta, gro, inp, mcif, mdl, mmcif, mol, mol2, pdb, pdbqt, png, sdf, smi, smiles, txt.",
    )

    parsed = cli.parse_args()
    # Fall back to an empty JSON object when no configuration was supplied.
    config = parsed.config or "{}"
    props = settings.ConfReader(config=config).get_prop_dic()

    # Run the building block with the parsed paths and properties.
    babel_add_hydrogens(
        input_path=parsed.input_path,
        output_path=parsed.output_path,
        properties=props,
    )

247 

248 

# Run the command line interface only when the module is executed as a script.
if __name__ == "__main__":
    main()