Coverage for biobb_chemistry / babelm / babel_convert.py: 94%

79 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-22 12:49 +0000

1#!/usr/bin/env python3 

2 

3"""Module containing the BabelConvert class and the command line interface.""" 

4 

5from typing import Optional 

6 

7from biobb_common.generic.biobb_object import BiobbObject 

8from biobb_common.tools.file_utils import launchlogger 

9 

10from biobb_chemistry.babelm.common import ( 

11 _from_string_to_list, 

12 check_input_path, 

13 check_output_path, 

14 get_coordinates, 

15 get_input_format, 

16 get_output_format, 

17 get_ph, 

18) 

19 

20 

class BabelConvert(BiobbObject):
    """
    | biobb_chemistry BabelConvert
    | This class is a wrapper of the Open Babel tool.
    | Small molecule format conversion for structures or trajectories. Open Babel is a chemical toolbox designed to speak the many languages of chemical data. It's an open, collaborative project allowing anyone to search, convert, analyze, or store data from molecular modeling, chemistry, solid-state materials, biochemistry, or related areas. `Visit the official page <http://openbabel.org/wiki/Main_Page>`_.

    Args:
        input_path (str): Path to the input file. File type: input. `Sample file <https://github.com/bioexcel/biobb_chemistry/raw/master/biobb_chemistry/test/data/babelm/babel.smi>`_. Accepted formats: dat (edam:format_1637), ent (edam:format_1476), fa (edam:format_1929), fasta (edam:format_1929), gro (edam:format_2033), inp (edam:format_3878), log (edam:format_2030), mcif (edam:format_1477), mdl (edam:format_3815), mmcif (edam:format_1477), mol (edam:format_3815), mol2 (edam:format_3816), pdb (edam:format_1476), pdbqt (edam:format_1476), png (edam:format_3603), sdf (edam:format_3814), smi (edam:format_1196), smiles (edam:format_1196), txt (edam:format_2033), xml (edam:format_2332), xtc (edam:format_3875).
        output_path (str): Path to the output file. File type: output. `Sample file <https://github.com/bioexcel/biobb_chemistry/raw/master/biobb_chemistry/test/reference/babelm/ref_babel.convert.mol2>`_. Accepted formats: ent (edam:format_1476), fa (edam:format_1929), fasta (edam:format_1929), gro (edam:format_2033), inp (edam:format_3878), mcif (edam:format_1477), mdl (edam:format_3815), mmcif (edam:format_1477), mol (edam:format_3815), mol2 (edam:format_3816), pdb (edam:format_1476), pdbqt (edam:format_1476), png (edam:format_3603), sdf (edam:format_3814), smi (edam:format_1196), smiles (edam:format_1196), txt (edam:format_2033).
        properties (dic - Python dictionary object containing the tool parameters, not input/output files):
            * **input_format** (*str*) - (None) Format of input file. If not provided, input_path extension will be taken. Values: dat (Information represented in a data record), ent (Protein Data Bank format), fa (FASTA sequence format), fasta (FASTA sequence format), gro (GROMACS structure), inp (AMBER trajectory format), log (Events file), mcif (Entry format of PDB database in mmCIF format), mdl (file format for holding information about the atoms; bonds; connectivity and coordinates of a molecule), mmcif (Entry format of PDB database in mmCIF format), mol (file format for holding information about the atoms; bonds; connectivity and coordinates of a molecule), mol2 (Complete and portable representation of a SYBYL molecule), pdb (Protein Data Bank format), pdbqt (Protein Data Bank format with charges), png (File format for image compression), sdf (One of a family of chemical-data file formats developed by MDL Information Systems), smi (Chemical structure specified in Simplified Molecular Input Line Entry System line notation.), smiles (Chemical structure specified in Simplified Molecular Input Line Entry System line notation.), txt (Textual format), xml (eXtensible Markup Language), xtc (Portable binary format for trajectories produced by GROMACS package).
            * **output_format** (*str*) - (None) Format of output file. If not provided, output_path extension will be taken. Values: ent (Protein Data Bank format), fa (FASTA sequence format), fasta (FASTA sequence format), gro (GROMACS structure), inp (AMBER trajectory format), mcif (Entry format of PDB database in mmCIF format), mdl (file format for holding information about the atoms; bonds; connectivity and coordinates of a molecule), mmcif (Entry format of PDB database in mmCIF format), mol (file format for holding information about the atoms; bonds; connectivity and coordinates of a molecule), mol2 (Complete and portable representation of a SYBYL molecule), pdb (Protein Data Bank format), pdbqt (Protein Data Bank format with charges), png (File format for image compression), sdf (One of a family of chemical-data file formats developed by MDL Information Systems), smi (Chemical structure specified in Simplified Molecular Input Line Entry System line notation.), smiles (Chemical structure specified in Simplified Molecular Input Line Entry System line notation.), txt (Textual format), xtc (Portable binary format for trajectories produced by GROMACS package).
            * **fs_input** (*list*) - (None) Format-specific input options. Values: b (disable automatic bonding), d (input file is in dlg -AutoDock docking log- format).
            * **fs_output** (*list*) - (None) Format-specific output options. Values: b (enable automatic bonding), r (output as a rigid molecule), c (combine separate molecular pieces of input into a single rigid molecule), s (output as a flexible residue), p (preserve atom indices from input file), h (preserve hydrogens), n (preserve atom names).
            * **coordinates** (*int*) - (None) Type of coordinates: 2D or 3D. Values: 2 (2D coordinates), 3 (3D coordinates).
            * **effort** (*str*) - ("medium") Computational effort wanted to dedicate for the conformer generation coordinates calculations, only for 3D coordinates. Values: fastest (only generate coordinates, no force field or conformer search), fast (perform quick forcefield optimization), medium (forcefield optimization + fast conformer search), better (more optimization + fast conformer search), best (more optimization + significant conformer search).
            * **ph** (*float*) - (7.4) [0~14|0.1] Add hydrogens appropriate for pH.
            * **flex** (*bool*) - (False) Remove all but the largest contiguous fragment (strip salts).
            * **binary_path** (*str*) - ("obabel") Path to the obabel executable binary.
            * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
            * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
            * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.
            * **container_path** (*str*) - (None) Container path definition.
            * **container_image** (*str*) - ('informaticsmatters/obabel:latest') Container image definition.
            * **container_volume_path** (*str*) - ('/tmp') Container volume path definition.
            * **container_working_dir** (*str*) - (None) Container working directory definition.
            * **container_user_id** (*str*) - (None) Container user_id definition.
            * **container_shell_path** (*str*) - ('/bin/bash') Path to default shell inside the container.

    Examples:
        This is a use example of how to use the building block from Python::

            from biobb_chemistry.babelm.babel_convert import babel_convert
            prop = {
                'input_format': 'smi',
                'output_format': 'mol2',
                'coordinates': 3,
                'ph': 7.4
            }
            babel_convert(input_path='/path/to/my2DMolecule.smi',
                          output_path='/path/to/new3DMolecule.mol2',
                          properties=prop)

    Info:
        * wrapped_software:
            * name: Open Babel
            * version: 2.4.1
            * license: GNU
        * ontology:
            * name: EDAM
            * schema: http://edamontology.org/EDAM.owl

    """

    def __init__(self, input_path, output_path, properties=None, **kwargs) -> None:
        # A missing (or otherwise falsy) properties argument becomes an empty dict.
        properties = properties or {}

        # The parent constructor receives the normalised properties.
        super().__init__(properties)
        # Snapshot of the local names visible at this point; it must stay
        # right here so that no additional local ends up in the copy.
        self.locals_var_dict = locals().copy()

        # Files handled by this block, grouped by direction.
        self.io_dict = {
            "in": {"input_path": input_path},
            "out": {"output_path": output_path},
        }

        # Block-specific settings, each with its fallback value.
        lookup = properties.get
        self.input_format = lookup("input_format", "")
        self.output_format = lookup("output_format", "")
        self.fs_input = _from_string_to_list(lookup("fs_input", None))
        self.fs_output = _from_string_to_list(lookup("fs_output", None))
        self.coordinates = lookup("coordinates", "")
        self.effort = lookup("effort", "medium")
        self.ph = lookup("ph", "")
        self.flex = lookup("flex", False)
        self.binary_path = lookup("binary_path", "obabel")
        self.properties = properties

        # Validation hooks (not defined in this class).
        self.check_properties(properties)
        self.check_arguments()

    def check_data_params(self, out_log, err_log):
        """Run both file paths through the path checkers and store the results.

        The values returned by ``check_input_path`` and ``check_output_path``
        replace the corresponding entries of ``self.io_dict``. ``err_log`` is
        accepted but not used here.
        """
        owner_name = self.__class__.__name__

        incoming = self.io_dict["in"]
        incoming["input_path"] = check_input_path(
            incoming["input_path"], out_log, owner_name
        )

        outgoing = self.io_dict["out"]
        outgoing["output_path"] = check_output_path(
            outgoing["output_path"], out_log, owner_name
        )

    def create_cmd(self, container_io_dict, out_log, err_log):
        """Assemble the command line as a list of string tokens.

        Token order: binary, ``-i<format>``, input path, ``-o<format>``,
        ``-O<output path>``, then the optional coordinate, pH, ``-r``,
        ``-a<option>``, ``-x<option>`` and effort tokens.

        Args:
            container_io_dict: mapping read at ``["in"]["input_path"]`` and
                ``["out"]["output_path"]``.
            out_log: logger handed to the format/coordinates/pH helpers.
            err_log: accepted but not used here.

        Returns:
            list: the tokens in the order described above.
        """
        # Executable, then the input format flag and the input file.
        source_file = container_io_dict["in"]["input_path"]
        source_format = get_input_format(self.input_format, source_file, out_log)
        cmd = [self.binary_path, "-i" + source_format, source_file]

        # Output format flag and the output file (glued to its -O flag).
        target_file = container_io_dict["out"]["output_path"]
        target_format = get_output_format(self.output_format, target_file, out_log)
        cmd.extend(["-o" + target_format, "-O" + target_file])

        # Coordinate generation, only when the helper yields a truthy value.
        dimension = get_coordinates(self.coordinates, out_log)
        if dimension:
            cmd.append("--gen" + dimension + "d")

        # pH: flag and value travel as ONE token with an embedded space.
        # NOTE(review): presumably the list is later joined into a single
        # command string — verify before splitting this token.
        ph_value = get_ph(self.ph, out_log)
        if ph_value:
            cmd.append("-p " + ph_value)

        # NOTE(review): "-r" is added when `flex` is falsy, whereas the class
        # docstring describes `flex` as the option that turns fragment
        # stripping on — confirm the intended polarity before changing it.
        if not self.flex:
            cmd.append("-r")

        # Format-specific input options, one "-a<option>" token each.
        if self.fs_input is not None:
            cmd.extend("-a" + option for option in self.fs_input)

        # Format-specific output options, one "-x<option>" token each.
        if self.fs_output is not None:
            cmd.extend("-x" + option for option in self.fs_output)

        # The effort flag is only emitted together with 3D coordinates.
        if dimension == "3":
            cmd.append("--" + self.effort)

        return cmd

    @launchlogger
    def launch(self) -> int:
        """Execute the :class:`BabelConvert <babelm.babel_convert.BabelConvert>` babelm.babel_convert.BabelConvert object."""

        # Validate the input/output paths.
        self.check_data_params(self.out_log, self.err_log)

        # Stop early (returning 0) when the restart check says so.
        if self.check_restart():
            return 0
        self.stage_files()

        # Build the command from the staged file locations.
        self.cmd = self.create_cmd(self.stage_io_dict, self.out_log, self.err_log)

        # Execute the command.
        self.run_biobb()

        # Bring the produced files back to the host.
        self.copy_to_host()

        # Clean up temporary data.
        self.remove_tmp_files()

        # Final argument check with outputs in place; it is asked not to raise.
        self.check_arguments(output_files_created=True, raise_exception=False)

        return self.return_code

201 

202 

def babel_convert(
    input_path: str, output_path: str, properties: Optional[dict] = None, **kwargs
) -> int:
    """Create the :class:`BabelConvert <babelm.babel_convert.BabelConvert>` class and
    execute the :meth:`launch() <babelm.babel_convert.BabelConvert.launch>` method."""
    # Extra keyword arguments are forwarded as one keyword literally named
    # ``kwargs`` (they are not expanded into individual keywords).
    block = BabelConvert(
        input_path=input_path,
        output_path=output_path,
        properties=properties,
        kwargs=kwargs,
    )
    return block.launch()

209 

210 

# The functional wrapper exposes the same documentation as the class.
babel_convert.__doc__ = BabelConvert.__doc__

# Command line entry point. `get_main` is not defined in BabelConvert itself,
# so it is presumably inherited from BiobbObject — verify there.
main = BabelConvert.get_main(babel_convert, "Small molecule format conversion.")


# Run the command line interface only when executed as a script.
if __name__ == "__main__":
    main()