Coverage for biobb_pytorch / mdae / mdfeaturizer.py: 91%

106 statements  

« prev     ^ index     » next       coverage.py v7.13.2, created at 2026-02-02 16:33 +0000

1#!/usr/bin/env python3 

2 

3import os 

4import torch 

5from biobb_pytorch.mdae.featurization.topology_selector import MDTopologySelector 

6from biobb_pytorch.mdae.featurization.featurizer import Featurizer 

7from biobb_common.generic.biobb_object import BiobbObject 

8from biobb_common.tools import file_utils as fu 

9from biobb_common.tools.file_utils import launchlogger 

10import numpy as np 

11from typing import Optional, Dict, Any 

12from biobb_pytorch.mdae.utils.log_utils import get_size 

13 

14 

class MDFeaturePipeline(BiobbObject):
    """
    | biobb_pytorch MDFeaturizer
    | Obtain the Molecular Dynamics Features for PyTorch model training.
    | Computes the features selected in the properties (Cartesian coordinates, distances, angles, dihedrals) from a trajectory and stores the resulting dataset and its statistics as PyTorch files.

    Args:
        input_trajectory_path (str) (Optional): Path to the input trajectory file (if omitted topology file is used as trajectory). File type: input. `Sample file <https://github.com/bioexcel/biobb_pytorch/raw/master/biobb_pytorch/test/data/mdae/train_mdae_traj.xtc>`_. Accepted formats: xtc (edam:format_3875), dcd (edam:format_3878).
        input_topology_path (str): Path to the input topology file. File type: input. `Sample file <https://github.com/bioexcel/biobb_pytorch/raw/master/biobb_pytorch/test/data/mdae/MCV1900209.pdb>`_. Accepted formats: pdb (edam:format_2333).
        input_labels_npy_path (str) (Optional): Path to the input labels file, loaded with NumPy and stored under the ``labels`` key of the dataset. File type: input. Accepted formats: npy (edam:format_4003).
        input_weights_npy_path (str) (Optional): Path to the input weights file, loaded with NumPy and stored under the ``weights`` key of the dataset. File type: input. Accepted formats: npy (edam:format_4003).
        output_dataset_pt_path (str): Path to the output dataset model file. File type: output. `Sample file <https://github.com/bioexcel/biobb_pytorch/raw/master/biobb_pytorch/test/reference/mdae/ref_output_dataset.pt>`_. Accepted formats: pt (edam:format_2333).
        output_stats_pt_path (str): Path to the output model statistics file. File type: output. `Sample file <https://github.com/bioexcel/biobb_pytorch/raw/master/biobb_pytorch/test/reference/mdae/ref_output_stats.pt>`_. Accepted formats: pt (edam:format_2333).
        properties (dict - Python dictionary object containing the tool parameters, not input/output files):
            * **cartesian** (*dict*) - ({"selection": "name CA"}) Atom selection options for Cartesian coordinates feature generation (e.g. selection, fit_selection).
            * **distances** (*dict*) - ({"selection": "name CA", "cutoff": 0.4, "periodic": True, "bonded": False}) Atom selection options for pairwise distance features (selection, cutoff, periodic, bonded, etc.).
            * **angles** (*dict*) - ({"selection": "backbone", "periodic": True, "bonded": True}) Atom selection options for angle features (selection, periodic, bonded, etc.).
            * **dihedrals** (*dict*) - ({"selection": "backbone", "periodic": True, "bonded": True}) Atom selection options for dihedral features (selection, periodic, bonded, etc.).
            * **options** (*dict*) - ({"norm_in": {"mode": "min_max"}}) General processing options (e.g. timelag, norm_in).

    Examples:
        This is a use case of how to use the building block from Python::

            from biobb_pytorch.mdae.mdfeaturizer import MDFeaturizer

            prop = {
                'cartesian': {'selection': 'name CA'},
                'distances': {'selection': 'name CA',
                              'cutoff': 0.4,
                              'periodic': True,
                              'bonded': False},
                'angles': {'selection': 'backbone',
                           'periodic': True,
                           'bonded': True},
                'dihedrals': {'selection': 'backbone',
                              'periodic': True,
                              'bonded': True},
                'options': {'timelag': 10,
                            'norm_in': {'mode': 'min_max'}
                            }
            }

            MDFeaturizer(input_trajectory_path=trajectory_file,
                         input_topology_path=topology_file,
                         output_dataset_pt_path=output_file,
                         output_stats_pt_path=output_stats_file,
                         properties=prop)

    Info:
        * wrapped_software:
            * name: PyTorch
            * version: >=1.6.0
            * license: BSD 3-Clause
        * ontology:
            * name: EDAM
            * schema: http://edamontology.org/EDAM.owl
    """

    def __init__(
        self,
        input_topology_path: str,
        output_dataset_pt_path: str,
        output_stats_pt_path: str,
        properties: Optional[dict],
        input_trajectory_path: Optional[str] = None,
        input_labels_npy_path: Optional[str] = None,
        input_weights_npy_path: Optional[str] = None,
        **kwargs,
    ) -> None:
        """Store the arguments and featurize the trajectory.

        The topology indexing and the featurization both run here, at
        construction time; :meth:`launch` only writes the results to disk.
        A falsy ``properties`` value is treated as an empty dictionary.
        """
        properties = properties or {}

        super().__init__(properties)

        # Without a trajectory, the topology file doubles as a trajectory.
        self.input_trajectory_path = input_trajectory_path or input_topology_path
        self.input_topology_path = input_topology_path
        self.input_labels_npy_path = input_labels_npy_path
        self.input_weights_npy_path = input_weights_npy_path
        self.output_dataset_pt_path = output_dataset_pt_path
        self.output_stats_pt_path = output_stats_pt_path
        self.config = properties.copy()
        self.locals_var_dict = locals().copy()

        # Input/Output files
        self.io_dict = {
            "in": {
                "input_trajectory_path": input_trajectory_path,
                "input_topology_path": input_topology_path,
                "input_labels_npy_path": input_labels_npy_path,
                "input_weights_npy_path": input_weights_npy_path,
            },
            "out": {
                "output_dataset_pt_path": output_dataset_pt_path,
                "output_stats_pt_path": output_stats_pt_path,
            },
        }

        # Per-feature arguments; each default dict is built fresh on every
        # call so instances never share a mutable default.
        self.feature_types = ["cartesian", "distances", "angles", "dihedrals"]
        self.cartesian: Dict[str, Any] = properties.get(
            "cartesian", {"selection": "name CA"}
        )
        self.distances: Dict[str, Any] = properties.get(
            "distances",
            {"selection": "name CA", "cutoff": 0.4, "periodic": True, "bonded": False},
        )
        self.angles: Dict[str, Any] = properties.get(
            "angles", {"selection": "backbone", "periodic": True, "bonded": True}
        )
        self.dihedrals: Dict[str, Any] = properties.get(
            "dihedrals", {"selection": "backbone", "periodic": True, "bonded": True}
        )
        self.options: Dict[str, Any] = properties.get(
            "options", {"norm_in": {"mode": "min_max"}}
        )

        # Check the properties
        self.check_properties(properties)
        self.check_arguments()

        # Topology indices
        self.topology_indices()

        # Featurizer
        self.featurize_trajectory()

    @launchlogger
    def topology_indices(self) -> None:
        """Index the topology according to the configuration and log a summary.

        Sets ``self.topology`` and ``self.features_idx_dict``.
        """
        fu.log("## BioBB Featurization - MDFeaturePipeline ##", self.out_log)

        fu.log(f"Obtaining the topology information from {self.input_topology_path}", self.out_log)

        self.topology = MDTopologySelector(self.input_topology_path)
        self.features_idx_dict = self.topology.topology_indexing(self.config)

        fu.log("Available Topology Properties:", self.out_log)
        fu.log(f" - Number of chains: {self.topology.topology.n_chains}", self.out_log)
        fu.log(f" - Number of residues: {self.topology.topology.n_residues}", self.out_log)
        fu.log(f" - Number of atoms: {self.topology.n_atoms}", self.out_log)

        # These counters may be missing from the selector; report "N/A"
        # instead of failing when one is.
        for label in ("distances", "angles", "dihedrals"):
            count = getattr(self.topology, f"n_{label}", "N/A")
            fu.log(f" - Number of {label}: {count}", self.out_log)

    @launchlogger
    def featurize_trajectory(self) -> None:
        """Compute the features of the trajectory and log a summary.

        Sets ``self.featurizer``, ``self.dataset`` and ``self.stats``. When
        label and/or weight files were given they are loaded with
        ``numpy.load`` and stored in the dataset under ``labels`` / ``weights``.
        """
        self.featurizer = Featurizer(
            self.input_trajectory_path,
            self.input_topology_path,
            self.input_labels_npy_path,
            self.input_weights_npy_path,
        )

        fu.log("Available Trajectory Properties:", self.out_log)
        fu.log(f" - Number of frames: {self.featurizer.trajectory.n_frames}", self.out_log)

        fu.log(f"Featurizing the trajectory {self.input_trajectory_path}", self.out_log)

        self.dataset, self.stats = self.featurizer.compute_features(self.features_idx_dict)

        if self.input_labels_npy_path:
            fu.log(f"Loading labels from {self.input_labels_npy_path}", self.out_log)
            self.dataset['labels'] = np.load(self.input_labels_npy_path)

        if self.input_weights_npy_path:
            fu.log(f"Loading weights from {self.input_weights_npy_path}", self.out_log)
            self.dataset['weights'] = np.load(self.input_weights_npy_path)

        fu.log("Features:", self.out_log)
        for feature_type in self.feature_types:
            try:
                selection = getattr(self, feature_type).get("selection")
                n_features = self.featurizer.features.get(
                    feature_type, np.zeros((0, 0))
                ).shape[1]
            except AttributeError:
                # Best-effort report: a feature whose options or values cannot
                # be inspected is flagged rather than silently dropped.
                fu.log(f" {feature_type.capitalize()}: N/A", self.out_log)
                continue
            fu.log(f" {feature_type.capitalize()}:", self.out_log)
            fu.log(f" - Topology Selection: {selection}", self.out_log)
            fu.log(f" - Number of features: {n_features}", self.out_log)

        fu.log("Postprocessing:", self.out_log)
        fu.log(f" - Normalization: {self.options.get('norm_in', {}).get('mode')}", self.out_log)
        # A missing timelag is reported as None, like a missing normalization.
        fu.log(f" - Timelag: {self.options.get('timelag')}", self.out_log)
        fu.log("Dataset Properties:", self.out_log)
        fu.log(f" - Dataset: {self.dataset.keys()}", self.out_log)
        fu.log(f" - Number of frames: {self.dataset['data'].shape[0]}", self.out_log)
        fu.log(f" - Number of features: {self.dataset['data'].shape[1]}", self.out_log)

    def _save_pt(self, obj: Any, path: str, label: str) -> None:
        """Save ``obj`` with ``torch.save`` to ``path`` and log location and size."""
        torch.save(obj, path)
        fu.log(f'{label} saved in .pt format in {os.path.abspath(path)}', self.out_log)
        fu.log(f'File size: {get_size(path)}', self.out_log)

    @launchlogger
    def launch(self) -> int:
        """
        Execute the :class:`MDFeaturePipeline <MDFeaturePipeline.MDFeaturePipeline>` object

        Writes the dataset and its statistics computed at construction time
        to the declared output paths and returns 0.
        """

        # Setup Biobb
        if self.check_restart():
            return 0

        self.stage_files()

        self._save_pt(self.dataset, self.output_dataset_pt_path, "Dataset")

        # Written to the declared path itself, so the file that is logged,
        # sized and checked below is the one that was actually created.
        self._save_pt(self.stats, self.output_stats_pt_path, "Dataset statistics")

        # Copy files to host
        self.copy_to_host()

        # Remove temporal files
        self.remove_tmp_files()

        self.check_arguments(output_files_created=True, raise_exception=False)

        return 0

239 

240 

def MDFeaturizer(
    input_topology_path: str,
    output_dataset_pt_path: str,
    output_stats_pt_path: str,
    properties: dict,
    input_trajectory_path: Optional[str] = None,
    input_labels_npy_path: Optional[str] = None,
    input_weights_npy_path: Optional[str] = None,
    **kwargs,
) -> int:
    """Instantiate :class:`MDFeaturePipeline` with the given arguments and
    return the value of its ``launch()`` call."""
    # Spelled out instead of forwarding locals(); extra keyword arguments
    # travel as one ``kwargs`` entry, which is what a locals() snapshot holds.
    pipeline_arguments = {
        "input_topology_path": input_topology_path,
        "output_dataset_pt_path": output_dataset_pt_path,
        "output_stats_pt_path": output_stats_pt_path,
        "properties": properties,
        "input_trajectory_path": input_trajectory_path,
        "input_labels_npy_path": input_labels_npy_path,
        "input_weights_npy_path": input_weights_npy_path,
        "kwargs": kwargs,
    }
    pipeline = MDFeaturePipeline(**pipeline_arguments)
    return pipeline.launch()

254 

255 

# The function wrapper takes the same arguments as the class, so it reuses
# the class documentation.
MDFeaturizer.__doc__ = MDFeaturePipeline.__doc__

# Entry point built by the class-level ``get_main`` helper from the function
# wrapper and a one-line description.
# NOTE(review): get_main is presumably inherited from BiobbObject - confirm.
main = MDFeaturePipeline.get_main(
    MDFeaturizer,
    "Obtain the Molecular Dynamics Features for PyTorch model training.",
)

if __name__ == "__main__":
    main()