Coverage for biobb_chemistry/babelm/babel_convert.py: 83%

90 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-03-12 09:28 +0000

1#!/usr/bin/env python3 

2 

3"""Module containing the BabelConvert class and the command line interface.""" 

4 

5import argparse 

6from typing import Optional 

7 

8from biobb_common.configuration import settings 

9from biobb_common.generic.biobb_object import BiobbObject 

10from biobb_common.tools.file_utils import launchlogger 

11 

12from biobb_chemistry.babelm.common import ( 

13 _from_string_to_list, 

14 check_input_path, 

15 check_output_path, 

16 get_coordinates, 

17 get_input_format, 

18 get_output_format, 

19 get_ph, 

20) 

21 

22 

class BabelConvert(BiobbObject):
    """
    | biobb_chemistry BabelConvert
    | This class is a wrapper of the Open Babel tool.
    | Small molecule format conversion for structures or trajectories. Open Babel is a chemical toolbox designed to speak the many languages of chemical data. It's an open, collaborative project allowing anyone to search, convert, analyze, or store data from molecular modeling, chemistry, solid-state materials, biochemistry, or related areas. `Visit the official page <http://openbabel.org/wiki/Main_Page>`_.

    Args:
        input_path (str): Path to the input file. File type: input. `Sample file <https://github.com/bioexcel/biobb_chemistry/raw/master/biobb_chemistry/test/data/babel/babel.smi>`_. Accepted formats: dat (edam:format_1637), ent (edam:format_1476), fa (edam:format_1929), fasta (edam:format_1929), gro (edam:format_2033), inp (edam:format_3878), log (edam:format_2030), mcif (edam:format_1477), mdl (edam:format_3815), mmcif (edam:format_1477), mol (edam:format_3815), mol2 (edam:format_3816), pdb (edam:format_1476), pdbqt (edam:format_1476), png (edam:format_3603), sdf (edam:format_3814), smi (edam:format_1196), smiles (edam:format_1196), txt (edam:format_2033), xml (edam:format_2332), xtc (edam:format_3875).
        output_path (str): Path to the output file. File type: output. `Sample file <https://github.com/bioexcel/biobb_chemistry/raw/master/biobb_chemistry/test/reference/babel/ref_babel.convert.mol2>`_. Accepted formats: ent (edam:format_1476), fa (edam:format_1929), fasta (edam:format_1929), gro (edam:format_2033), inp (edam:format_3878), mcif (edam:format_1477), mdl (edam:format_3815), mmcif (edam:format_1477), mol (edam:format_3815), mol2 (edam:format_3816), pdb (edam:format_1476), pdbqt (edam:format_1476), png (edam:format_3603), sdf (edam:format_3814), smi (edam:format_1196), smiles (edam:format_1196), txt (edam:format_2033).
        properties (dic - Python dictionary object containing the tool parameters, not input/output files):
            * **input_format** (*str*) - (None) Format of input file. If not provided, input_path extension will be taken. Values: dat (Information represented in a data record), ent (Protein Data Bank format), fa (FASTA sequence format), fasta (FASTA sequence format), gro (GROMACS structure), inp (AMBER trajectory format), log (Events file), mcif (Entry format of PDB database in mmCIF format), mdl (file format for holding information about the atoms; bonds; connectivity and coordinates of a molecule), mmcif (Entry format of PDB database in mmCIF format), mol (file format for holding information about the atoms; bonds; connectivity and coordinates of a molecule), mol2 (Complete and portable representation of a SYBYL molecule), pdb (Protein Data Bank format), pdbqt (Protein Data Bank format with charges), png (File format for image compression), sdf (One of a family of chemical-data file formats developed by MDL Information Systems), smi (Chemical structure specified in Simplified Molecular Input Line Entry System line notation.), smiles (Chemical structure specified in Simplified Molecular Input Line Entry System line notation.), txt (Textual format), xml (eXtensible Markup Language), xtc (Portable binary format for trajectories produced by GROMACS package).
            * **output_format** (*str*) - (None) Format of output file. If not provided, output_path extension will be taken. Values: ent (Protein Data Bank format), fa (FASTA sequence format), fasta (FASTA sequence format), gro (GROMACS structure), inp (AMBER trajectory format), mcif (Entry format of PDB database in mmCIF format), mdl (file format for holding information about the atoms; bonds; connectivity and coordinates of a molecule), mmcif (Entry format of PDB database in mmCIF format), mol (file format for holding information about the atoms; bonds; connectivity and coordinates of a molecule), mol2 (Complete and portable representation of a SYBYL molecule), pdb (Protein Data Bank format), pdbqt (Protein Data Bank format with charges), png (File format for image compression), sdf (One of a family of chemical-data file formats developed by MDL Information Systems), smi (Chemical structure specified in Simplified Molecular Input Line Entry System line notation.), smiles (Chemical structure specified in Simplified Molecular Input Line Entry System line notation.), txt (Textual format), xtc (Portable binary format for trajectories produced by GROMACS package).
            * **fs_input** (*list*) - (None) Format-specific input options. Values: b (disable automatic bonding), d (input file is in dlg -AutoDock docking log- format).
            * **fs_output** (*list*) - (None) Format-specific output options. Values: b (enable automatic bonding), r (output as a rigid molecule), c (combine separate molecular pieces of input into a single rigid molecule), s (output as a flexible residue), p (preserve atom indices from input file), h (preserve hydrogens), n (preserve atom names).
            * **coordinates** (*int*) - (None) Type of coordinates: 2D or 3D. Values: 2 (2D coordinates), 3 (3D coordinates).
            * **effort** (*str*) - ("medium") Computational effort wanted to dedicate for the conformer generation coordinates calculations, only for 3D coordinates. Values: fastest (only generate coordinates, no force field or conformer search), fast (perform quick forcefield optimization), medium (forcefield optimization + fast conformer search), better (more optimization + fast conformer search), best (more optimization + significant conformer search).
            * **ph** (*float*) - (7.4) [0~14|0.1] Add hydrogens appropriate for pH.
            * **flex** (*bool*) - (False) Remove all but the largest contiguous fragment (strip salts).
            * **binary_path** (*str*) - ("obabel") Path to the obabel executable binary.
            * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
            * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
            * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.
            * **container_path** (*str*) - (None) Container path definition.
            * **container_image** (*str*) - ('informaticsmatters/obabel:latest') Container image definition.
            * **container_volume_path** (*str*) - ('/tmp') Container volume path definition.
            * **container_working_dir** (*str*) - (None) Container working directory definition.
            * **container_user_id** (*str*) - (None) Container user_id definition.
            * **container_shell_path** (*str*) - ('/bin/bash') Path to default shell inside the container.

    Examples:
        This is a use example of how to use the building block from Python::

            from biobb_chemistry.babelm.babel_convert import babel_convert
            prop = {
                'input_format': 'smi',
                'output_format': 'mol2',
                'coordinates': 3,
                'ph': 7.4
            }
            babel_convert(input_path='/path/to/my2DMolecule.smi',
                          output_path='/path/to/new3DMolecule.mol2',
                          properties=prop)

    Info:
        * wrapped_software:
            * name: Open Babel
            * version: 2.4.1
            * license: GNU
        * ontology:
            * name: EDAM
            * schema: http://edamontology.org/EDAM.owl

    """

    def __init__(self, input_path, output_path, properties=None, **kwargs) -> None:
        """Store the I/O paths and the tool-specific properties.

        Args:
            input_path: Path to the input file.
            output_path: Path to the output file.
            properties: Optional dictionary of tool parameters; a falsy value
                is replaced by an empty dictionary.
            **kwargs: Accepted for interface compatibility; not read here.
        """
        properties = properties or {}

        # Call parent class constructor
        super().__init__(properties)
        # Snapshot of the constructor arguments; must be taken before any
        # additional local name is created in this method.
        self.locals_var_dict = locals().copy()

        # Input/Output files
        self.io_dict = {
            "in": {"input_path": input_path},
            "out": {"output_path": output_path},
        }

        # Properties specific for BB
        self.input_format = properties.get("input_format", "")
        self.output_format = properties.get("output_format", "")
        self.fs_input = _from_string_to_list(properties.get("fs_input", None))
        self.fs_output = _from_string_to_list(properties.get("fs_output", None))
        self.coordinates = properties.get("coordinates", "")
        self.effort = properties.get("effort", "medium")
        self.ph = properties.get("ph", "")
        self.flex = properties.get("flex", False)
        self.binary_path = properties.get("binary_path", "obabel")
        self.properties = properties

        # Check the properties
        self.check_properties(properties)
        self.check_arguments()

    def check_data_params(self, out_log, err_log):
        """Checks all the input/output paths and parameters

        Both entries of ``self.io_dict`` are replaced by the value returned
        from the corresponding path checker.

        Args:
            out_log: Logger handed to the path checkers.
            err_log: Not used by this method.
        """
        self.io_dict["in"]["input_path"] = check_input_path(
            self.io_dict["in"]["input_path"], out_log, self.__class__.__name__
        )
        self.io_dict["out"]["output_path"] = check_output_path(
            self.io_dict["out"]["output_path"], out_log, self.__class__.__name__
        )

    def create_cmd(self, container_io_dict, out_log, err_log):
        """Creates the command line instruction using the properties file settings

        Args:
            container_io_dict (dict): I/O dictionary; its
                ``["in"]["input_path"]`` and ``["out"]["output_path"]``
                entries are read.
            out_log: Logger handed to the format, coordinates and pH helpers.
            err_log: Not used by this method.

        Returns:
            list: Command tokens, starting with ``self.binary_path``. Optional
            flags are present only when the matching setting yields a value,
            so the list never carries empty-string placeholders.
        """
        # executable path
        instructions_list = [self.binary_path]

        # generating input: -i<format> <path>
        input_path = container_io_dict["in"]["input_path"]
        input_format = get_input_format(self.input_format, input_path, out_log)
        instructions_list.append("-i" + input_format)
        instructions_list.append(input_path)

        # generating output: -o<format> -O<path>
        output_path = container_io_dict["out"]["output_path"]
        output_format = get_output_format(self.output_format, output_path, out_log)
        instructions_list.append("-o" + output_format)
        instructions_list.append("-O" + output_path)

        # adding coordinates (only when the helper returns a value)
        crd = get_coordinates(self.coordinates, out_log)
        if crd:
            instructions_list.append("--gen" + crd + "d")

        # adding pH (only when the helper returns a value)
        ph_value = get_ph(self.ph, out_log)
        if ph_value:
            instructions_list.append("-p " + ph_value)

        # flex
        # NOTE(review): the class docstring describes ``flex`` as stripping
        # salts, yet ``-r`` is emitted when ``flex`` is falsy. Confirm the
        # intended polarity against the obabel documentation.
        if not self.flex:
            instructions_list.append("-r")

        # fs_input
        if self.fs_input is not None:
            instructions_list.extend("-a" + option for option in self.fs_input)

        # fs_output
        if self.fs_output is not None:
            instructions_list.extend("-x" + option for option in self.fs_output)

        # adding effort (only for 3D coordinates)
        if crd == "3":
            instructions_list.append("--" + self.effort)

        return instructions_list

    @launchlogger
    def launch(self) -> int:
        """Execute the :class:`BabelConvert <babelm.babel_convert.BabelConvert>` babelm.babel_convert.BabelConvert object.

        Returns:
            int: ``0`` when ``check_restart()`` is truthy (nothing is run),
            otherwise ``self.return_code`` after the run.
        """

        # check input/output paths and parameters
        self.check_data_params(self.out_log, self.err_log)

        # Setup Biobb
        if self.check_restart():
            return 0
        self.stage_files()

        # create command line instruction
        self.cmd = self.create_cmd(self.stage_io_dict, self.out_log, self.err_log)

        # Run Biobb block
        self.run_biobb()

        # Copy files to host
        self.copy_to_host()

        # remove temporary folder(s)
        # NOTE(review): the staged "unique_dir" is not added to tmp_files here
        # (an earlier extend() call was disabled); presumably the base class
        # takes care of it. Confirm.
        self.remove_tmp_files()

        self.check_arguments(output_files_created=True, raise_exception=False)

        return self.return_code

204 

205 

def babel_convert(
    input_path: str, output_path: str, properties: Optional[dict] = None, **kwargs
) -> int:
    """Execute the :class:`BabelConvert <babelm.babel_convert.BabelConvert>` class and
    execute the :meth:`launch() <babelm.babel_convert.BabelConvert.launch>` method."""

    return BabelConvert(
        input_path=input_path, output_path=output_path, properties=properties, **kwargs
    ).launch()


# Expose the class documentation on the function. This has to run at module
# level: placed after the ``return`` inside the function it was unreachable,
# so the docstring was never propagated.
babel_convert.__doc__ = BabelConvert.__doc__

217 

218 

def main():
    """Parse the command line and run the building block once.

    Accepts an optional ``--config`` plus the required ``--input_path`` and
    ``--output_path``; when no configuration is given, ``"{}"`` is used.
    """

    def _raw_text_formatter(prog):
        # Formatter factory: raw-text help with a fixed width of 99999.
        return argparse.RawTextHelpFormatter(prog, width=99999)

    cli = argparse.ArgumentParser(
        description="Small molecule format conversion.",
        formatter_class=_raw_text_formatter,
    )
    cli.add_argument("--config", required=False, help="Configuration file")

    # Specific args of each building block
    mandatory = cli.add_argument_group("required arguments")
    path_options = (
        (
            "--input_path",
            "Path to the input file. Accepted formats: dat, ent, fa, fasta, gro, inp, log, mcif, mdl, mmcif, mol, mol2, pdb, pdbqt, png, sdf, smi, smiles, txt, xml, xtc.",
        ),
        (
            "--output_path",
            "Path to the output file. Accepted formats: ent, fa, fasta, gro, inp, mcif, mdl, mmcif, mol, mol2, pdb, pdbqt, png, sdf, smi, smiles, txt.",
        ),
    )
    for flag, help_text in path_options:
        mandatory.add_argument(flag, required=True, help=help_text)

    parsed = cli.parse_args()
    parsed.config = parsed.config or "{}"
    tool_properties = settings.ConfReader(config=parsed.config).get_prop_dic()

    # Specific call of each building block
    babel_convert(
        input_path=parsed.input_path,
        output_path=parsed.output_path,
        properties=tool_properties,
    )

248 

249 

# Run the command line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()