Coverage for biobb_chemistry/babelm/babel_remove_hydrogens.py: 84%

87 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-03-12 09:28 +0000

1#!/usr/bin/env python3 

2 

3"""Module containing the BabelRemoveHydrogens class and the command line interface.""" 

4 

5import argparse 

6from typing import Optional 

7 

8from biobb_common.configuration import settings 

9from biobb_common.generic.biobb_object import BiobbObject 

10from biobb_common.tools.file_utils import launchlogger 

11 

12from biobb_chemistry.babelm.common import ( 

13 _from_string_to_list, 

14 check_input_path, 

15 check_output_path, 

16 get_coordinates, 

17 get_input_format, 

18 get_output_format, 

19 get_ph, 

20) 

21 

22 

class BabelRemoveHydrogens(BiobbObject):
    """
    | biobb_chemistry BabelRemoveHydrogens
    | This class is a wrapper of the Open Babel tool.
    | Removes hydrogens to a given structure or trajectory. Open Babel is a chemical toolbox designed to speak the many languages of chemical data. It's an open, collaborative project allowing anyone to search, convert, analyze, or store data from molecular modeling, chemistry, solid-state materials, biochemistry, or related areas. `Visit the official page <http://openbabel.org/wiki/Main_Page>`_.

    Args:
        input_path (str): Path to the input file. File type: input. `Sample file <https://github.com/bioexcel/biobb_chemistry/raw/master/biobb_chemistry/test/data/babel/babel.H.pdb>`_. Accepted formats: dat (edam:format_1637), ent (edam:format_1476), fa (edam:format_1929), fasta (edam:format_1929), gro (edam:format_2033), inp (edam:format_3878), log (edam:format_2030), mcif (edam:format_1477), mdl (edam:format_3815), mmcif (edam:format_1477), mol (edam:format_3815), mol2 (edam:format_3816), pdb (edam:format_1476), pdbqt (edam:format_1476), png (edam:format_3603), sdf (edam:format_3814), smi (edam:format_1196), smiles (edam:format_1196), txt (edam:format_2033), xml (edam:format_2332), xtc (edam:format_3875).
        output_path (str): Path to the output file. File type: output. `Sample file <https://github.com/bioexcel/biobb_chemistry/raw/master/biobb_chemistry/test/reference/babel/ref_babel.nohydrogens.pdb>`_. Accepted formats: ent (edam:format_1476), fa (edam:format_1929), fasta (edam:format_1929), gro (edam:format_2033), inp (edam:format_3878), mcif (edam:format_1477), mdl (edam:format_3815), mmcif (edam:format_1477), mol (edam:format_3815), mol2 (edam:format_3816), pdb (edam:format_1476), pdbqt (edam:format_1476), png (edam:format_3603), sdf (edam:format_3814), smi (edam:format_1196), smiles (edam:format_1196), txt (edam:format_2033).
        properties (dic - Python dictionary object containing the tool parameters, not input/output files):
            * **input_format** (*str*) - (None) Format of input file. If not provided, input_path extension will be taken. Values: dat (Information represented in a data record), ent (Protein Data Bank format), fa (FASTA sequence format), fasta (FASTA sequence format), gro (GROMACS structure), inp (AMBER trajectory format), log (Events file), mcif (Entry format of PDB database in mmCIF format), mdl (file format for holding information about the atoms; bonds; connectivity and coordinates of a molecule), mmcif (Entry format of PDB database in mmCIF format), mol (file format for holding information about the atoms; bonds; connectivity and coordinates of a molecule), mol2 (Complete and portable representation of a SYBYL molecule), pdb (Protein Data Bank format), pdbqt (Protein Data Bank format with charges), png (File format for image compression), sdf (One of a family of chemical-data file formats developed by MDL Information Systems), smi (Chemical structure specified in Simplified Molecular Input Line Entry System line notation.), smiles (Chemical structure specified in Simplified Molecular Input Line Entry System line notation.), txt (Textual format), xml (eXtensible Markup Language), xtc (Portable binary format for trajectories produced by GROMACS package).
            * **output_format** (*str*) - (None) Format of output file. If not provided, output_path extension will be taken. Values: ent (Protein Data Bank format), fa (FASTA sequence format), fasta (FASTA sequence format), gro (GROMACS structure), inp (AMBER trajectory format), mcif (Entry format of PDB database in mmCIF format), mdl (file format for holding information about the atoms; bonds; connectivity and coordinates of a molecule), mmcif (Entry format of PDB database in mmCIF format), mol (file format for holding information about the atoms; bonds; connectivity and coordinates of a molecule), mol2 (Complete and portable representation of a SYBYL molecule), pdb (Protein Data Bank format), pdbqt (Protein Data Bank format with charges), png (File format for image compression), sdf (One of a family of chemical-data file formats developed by MDL Information Systems), smi (Chemical structure specified in Simplified Molecular Input Line Entry System line notation.), smiles (Chemical structure specified in Simplified Molecular Input Line Entry System line notation.), txt (Textual format), xtc (Portable binary format for trajectories produced by GROMACS package).
            * **fs_input** (*list*) - (None) Format-specific input options. Values: b (disable automatic bonding), d (input file is in dlg -AutoDock docking log- format).
            * **fs_output** (*list*) - (None) Format-specific output options. Values: b (enable automatic bonding), r (output as a rigid molecule), c (combine separate molecular pieces of input into a single rigid molecule), s (output as a flexible residue), p (preserve atom indices from input file), h (preserve hydrogens), n (preserve atom names).
            * **coordinates** (*int*) - (None) Type of coordinates: 2D or 3D. Values: 2 (2D coordinates), 3 (3D coordinates).
            * **effort** (*str*) - ("medium") Computational effort wanted to dedicate for the conformer generation coordinates calculations, only for 3D coordinates. Values: fastest (only generate coordinates, no force field or conformer search), fast (perform quick forcefield optimization), medium (forcefield optimization + fast conformer search), better (more optimization + fast conformer search), best (more optimization + significant conformer search).
            * **ph** (*float*) - (7.4) [0~14|0.1] Add hydrogens appropriate for pH.
            * **binary_path** (*str*) - ("obabel") Path to the obabel executable binary.
            * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
            * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
            * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.
            * **container_path** (*str*) - (None) Container path definition.
            * **container_image** (*str*) - ('informaticsmatters/obabel:latest') Container image definition.
            * **container_volume_path** (*str*) - ('/tmp') Container volume path definition.
            * **container_working_dir** (*str*) - (None) Container working directory definition.
            * **container_user_id** (*str*) - (None) Container user_id definition.
            * **container_shell_path** (*str*) - ('/bin/bash') Path to default shell inside the container.

    Examples:
        This is a use example of how to use the building block from Python::

            from biobb_chemistry.babelm.babel_remove_hydrogens import babel_remove_hydrogens
            prop = {
                'input_format': 'pdb',
                'output_format': 'pdb',
                'coordinates': 3,
                'ph': 7.4
            }
            babel_remove_hydrogens(input_path='/path/to/myStructure.pdb',
                                output_path='/path/to/newStructure.pdb',
                                properties=prop)

    Info:
        * wrapped_software:
            * name: Open Babel
            * version: 2.4.1
            * license: GNU
        * ontology:
            * name: EDAM
            * schema: http://edamontology.org/EDAM.owl

    """

    def __init__(self, input_path, output_path, properties=None, **kwargs) -> None:
        # Any falsy ``properties`` value (None, empty dict, ...) becomes a fresh dict.
        properties = properties if properties else {}

        # Initialise the parent first, then snapshot the constructor's locals.
        # No extra local may be introduced before this snapshot is taken.
        super().__init__(properties)
        self.locals_var_dict = locals().copy()

        # Files handled by this block, grouped by direction.
        self.io_dict = {
            "in": dict(input_path=input_path),
            "out": dict(output_path=output_path),
        }

        # Block-specific settings, each with the fallback used when the key is absent.
        read = properties.get
        self.input_format = read("input_format", "")
        self.output_format = read("output_format", "")
        self.fs_input = _from_string_to_list(read("fs_input", None))
        self.fs_output = _from_string_to_list(read("fs_output", None))
        self.coordinates = read("coordinates", "")
        self.effort = read("effort", "medium")
        self.ph = read("ph", "")
        self.binary_path = read("binary_path", "obabel")
        self.properties = properties

        # Validate properties and arguments once every attribute is in place.
        self.check_properties(properties)
        self.check_arguments()

    def check_data_params(self, out_log, err_log):
        """Pass the input and output paths through their checkers.

        Whatever ``check_input_path`` / ``check_output_path`` return is stored
        back into ``self.io_dict`` under the same key. ``err_log`` is accepted
        but not used here.
        """
        owner = self.__class__.__name__
        checks = (
            ("in", "input_path", check_input_path),
            ("out", "output_path", check_output_path),
        )
        # Input is checked before output, as the tuple order dictates.
        for section, key, checker in checks:
            self.io_dict[section][key] = checker(
                self.io_dict[section][key], out_log, owner
            )

    def create_cmd(self, container_io_dict, out_log, err_log):
        """Assemble the obabel invocation as a list of argument strings.

        Args:
            container_io_dict: mapping with ``["in"]["input_path"]`` and
                ``["out"]["output_path"]`` entries to put on the command line.
            out_log: logger handed to the format/coordinates/pH helpers.
            err_log: accepted for interface symmetry; not used here.

        Returns:
            list: executable, input format and path, output format and path,
            optional ``--gen<N>d``, ``-d``, optional ``-p <ph>``, any ``-a`` /
            ``-x`` format-specific options and, for 3D coordinates only, the
            ``--<effort>`` flag -- in that order.
        """
        # Executable first.
        cmd = [self.binary_path]

        # Input: format flag followed by the file itself.
        source = container_io_dict["in"]["input_path"]
        in_fmt = get_input_format(self.input_format, source, out_log)
        cmd.append("-i" + in_fmt)
        cmd.append(source)

        # Output: format flag followed by the -O<path> argument.
        target = container_io_dict["out"]["output_path"]
        out_fmt = get_output_format(self.output_format, target, out_log)
        cmd.append("-o" + out_fmt)
        cmd.append("-O" + target)

        # Coordinate generation is requested only when the helper yields a value.
        crd = get_coordinates(self.coordinates, out_log)
        if crd:
            cmd.append("--gen" + crd + "d")

        # The hydrogen-removal switch is always present.
        cmd.append("-d")

        # pH option, only when the helper yields a value.
        ph_value = get_ph(self.ph, out_log)
        if ph_value:
            cmd.append("-p " + ph_value)

        # Format-specific input (-a) and output (-x) options.
        if self.fs_input is not None:
            cmd.extend(["-a" + option for option in self.fs_input])
        if self.fs_output is not None:
            cmd.extend(["-x" + option for option in self.fs_output])

        # The effort level is meaningful for 3D coordinates only.
        if crd == "3":
            cmd.append("--" + self.effort)

        return cmd

    @launchlogger
    def launch(self) -> int:
        """Run this :class:`BabelRemoveHydrogens <babelm.babel_remove_hydrogens.BabelRemoveHydrogens>` object.

        Returns:
            int: ``0`` when ``check_restart()`` reports that execution can be
            skipped, otherwise ``self.return_code`` after the run.
        """
        # Check input/output paths and parameters.
        self.check_data_params(self.out_log, self.err_log)

        # Biobb setup: honour the restart check, then stage the files.
        if self.check_restart():
            return 0
        self.stage_files()

        # Build the command line from the staged paths.
        self.cmd = self.create_cmd(self.stage_io_dict, self.out_log, self.err_log)

        # Run the block and copy the results back to the host.
        self.run_biobb()
        self.copy_to_host()

        # Remove temporary files.
        self.remove_tmp_files()

        # Re-check the arguments now that outputs should exist, without raising.
        self.check_arguments(output_files_created=True, raise_exception=False)

        return self.return_code

200 

201 

def babel_remove_hydrogens(
    input_path: str, output_path: str, properties: Optional[dict] = None, **kwargs
) -> int:
    """Execute the :class:`BabelRemoveHydrogens <babelm.babel_remove_hydrogens.BabelRemoveHydrogens>` class and
    execute the :meth:`launch() <babelm.babel_remove_hydrogens.BabelRemoveHydrogens.launch>` method.

    Args:
        input_path: forwarded to the ``BabelRemoveHydrogens`` constructor.
        output_path: forwarded to the ``BabelRemoveHydrogens`` constructor.
        properties: optional properties dictionary, forwarded unchanged.
        **kwargs: extra keyword arguments, forwarded unchanged.

    Returns:
        int: the value returned by ``BabelRemoveHydrogens.launch()``.
    """
    return BabelRemoveHydrogens(
        input_path=input_path, output_path=output_path, properties=properties, **kwargs
    ).launch()


# Expose the class documentation on the function wrapper. This assignment has
# to live at module level: placed after the ``return`` inside the function body
# it could never execute, so the docstring was never actually copied.
babel_remove_hydrogens.__doc__ = BabelRemoveHydrogens.__doc__

213 

214 

def main():
    """Command line execution of this building block. Please check the command line documentation."""

    def _wide_help(prog):
        # Raw-text help with a very large width so long help lines are not wrapped.
        return argparse.RawTextHelpFormatter(prog, width=99999)

    parser = argparse.ArgumentParser(
        description="Removes hydrogen atoms to small molecules.",
        formatter_class=_wide_help,
    )
    parser.add_argument("--config", required=False, help="Configuration file")

    # Arguments specific to this building block.
    required = parser.add_argument_group("required arguments")
    required.add_argument(
        "--input_path",
        required=True,
        help="Path to the input file. Accepted formats: dat, ent, fa, fasta, gro, inp, log, mcif, mdl, mmcif, mol, mol2, pdb, pdbqt, png, sdf, smi, smiles, txt, xml, xtc.",
    )
    required.add_argument(
        "--output_path",
        required=True,
        help="Path to the output file. Accepted formats: ent, fa, fasta, gro, inp, mcif, mdl, mmcif, mol, mol2, pdb, pdbqt, png, sdf, smi, smiles, txt.",
    )

    cli = parser.parse_args()

    # A missing/empty --config falls back to the literal "{}" before being read.
    config = cli.config or "{}"
    props = settings.ConfReader(config=config).get_prop_dic()

    # Run the building block with the parsed paths and properties.
    babel_remove_hydrogens(
        input_path=cli.input_path,
        output_path=cli.output_path,
        properties=props,
    )

244 

245 

if __name__ == "__main__":
    # Run the command line interface only when executed as a script,
    # never when the module is imported.
    main()