Coverage for biobb_pytorch / mdae / mdfeaturizer.py: 91%
106 statements
« prev ^ index » next coverage.py v7.13.2, created at 2026-02-02 16:33 +0000
« prev ^ index » next coverage.py v7.13.2, created at 2026-02-02 16:33 +0000
1#!/usr/bin/env python3
3import os
4import torch
5from biobb_pytorch.mdae.featurization.topology_selector import MDTopologySelector
6from biobb_pytorch.mdae.featurization.featurizer import Featurizer
7from biobb_common.generic.biobb_object import BiobbObject
8from biobb_common.tools import file_utils as fu
9from biobb_common.tools.file_utils import launchlogger
10import numpy as np
11from typing import Optional, Dict, Any
12from biobb_pytorch.mdae.utils.log_utils import get_size
class MDFeaturePipeline(BiobbObject):
    """
    | biobb_pytorch MDFeaturizer
    | Obtain the Molecular Dynamics Features for PyTorch model training.
    | Obtain the Molecular Dynamics Features for PyTorch model training.

    Args:
        input_trajectory_path (str) (Optional): Path to the input trajectory file (if omitted topology file is used as trajectory). File type: input. `Sample file <https://github.com/bioexcel/biobb_pytorch/raw/master/biobb_pytorch/test/data/mdae/train_mdae_traj.xtc>`_. Accepted formats: xtc (edam:format_3875), dcd (edam:format_3878).
        input_topology_path (str): Path to the input topology file. File type: input. `Sample file <https://github.com/bioexcel/biobb_pytorch/raw/master/biobb_pytorch/test/data/mdae/MCV1900209.pdb>`_. Accepted formats: pdb (edam:format_2333).
        input_labels_npy_path (str) (Optional): Path to the input labels file; its content is stored in the output dataset under the 'labels' key. File type: input. Accepted formats: npy (edam:format_4003).
        input_weights_npy_path (str) (Optional): Path to the input weights file; its content is stored in the output dataset under the 'weights' key. File type: input. Accepted formats: npy (edam:format_4003).
        output_dataset_pt_path (str): Path to the output dataset model file. File type: output. `Sample file <https://github.com/bioexcel/biobb_pytorch/raw/master/biobb_pytorch/test/reference/mdae/ref_output_dataset.pt>`_. Accepted formats: pt (edam:format_2333).
        output_stats_pt_path (str): Path to the output model statistics file. File type: output. `Sample file <https://github.com/bioexcel/biobb_pytorch/raw/master/biobb_pytorch/test/reference/mdae/ref_output_stats.pt>`_. Accepted formats: pt (edam:format_2333).
        properties (dict - Python dictionary object containing the tool parameters, not input/output files):
            * **cartesian** (*dict*) - ({"selection": "name CA"}) Atom selection options for Cartesian coordinates feature generation (e.g. selection, fit_selection).
            * **distances** (*dict*) - ({"selection": "name CA", "cutoff": 0.4, "periodic": True, "bonded": False}) Atom selection options for pairwise distance features (selection, cutoff, periodic, bonded, etc.).
            * **angles** (*dict*) - ({"selection": "backbone", "periodic": True, "bonded": True}) Atom selection options for angle features (selection, periodic, bonded, etc.).
            * **dihedrals** (*dict*) - ({"selection": "backbone", "periodic": True, "bonded": True}) Atom selection options for dihedral features (selection, periodic, bonded, etc.).
            * **options** (*dict*) - ({"norm_in": {"mode": "min_max"}}) General processing options (e.g. timelag, norm_in).

    Examples:
        This is a use case of how to use the building block from Python::

            from biobb_pytorch.mdae.mdfeaturizer import MDFeaturizer

            prop = {
                'cartesian': {'selection': 'name CA'},
                'distances': {'selection': 'name CA',
                              'cutoff': 0.4,
                              'periodic': True,
                              'bonded': False},
                'angles': {'selection': 'backbone',
                           'periodic': True,
                           'bonded': True},
                'dihedrals': {'selection': 'backbone',
                              'periodic': True,
                              'bonded': True},
                'options': {'timelag': 10,
                            'norm_in': {'mode': 'min_max'}
                            }
            }

            MDFeaturizer(input_trajectory_path=trajectory_file,
                         input_topology_path=topology_file,
                         output_dataset_pt_path=output_file,
                         output_stats_pt_path=output_stats_file,
                         properties=prop)

    Info:
        * wrapped_software:
            * name: PyTorch
            * version: >=1.6.0
            * license: BSD 3-Clause
        * ontology:
            * name: EDAM
            * schema: http://edamontology.org/EDAM.owl
    """

    def __init__(
        self,
        input_topology_path: str,
        output_dataset_pt_path: str,
        output_stats_pt_path: str,
        properties: dict,
        input_trajectory_path: Optional[str] = None,
        input_labels_npy_path: Optional[str] = None,
        input_weights_npy_path: Optional[str] = None,
        **kwargs,
    ) -> None:
        """Store the inputs, then index the topology and featurize the trajectory.

        NOTE(review): both the topology indexing and the featurization run
        here, at construction time, i.e. before ``launch()`` gets the chance
        to call ``check_restart()``.
        """
        properties = properties or {}

        super().__init__(properties)

        # When no trajectory is supplied the topology file doubles as one.
        self.input_trajectory_path = input_trajectory_path or input_topology_path
        self.input_topology_path = input_topology_path
        self.input_labels_npy_path = input_labels_npy_path
        self.input_weights_npy_path = input_weights_npy_path
        self.output_dataset_pt_path = output_dataset_pt_path
        self.output_stats_pt_path = output_stats_pt_path
        self.config = properties.copy()
        self.locals_var_dict = locals().copy()

        # Input/Output files
        self.io_dict = {
            "in": {
                "input_trajectory_path": input_trajectory_path,
                "input_topology_path": input_topology_path,
                "input_labels_npy_path": input_labels_npy_path,
                "input_weights_npy_path": input_weights_npy_path,
            },
            "out": {
                "output_dataset_pt_path": output_dataset_pt_path,
                "output_stats_pt_path": output_stats_pt_path,
            },
        }

        # build the per-feature arguments
        # NOTE(review): these defaults are only read back for logging in this
        # class; the topology indexing receives ``self.config`` (the raw
        # properties) -- confirm that it applies the same defaults.
        self.feature_types = ["cartesian", "distances", "angles", "dihedrals"]
        self.cartesian: dict = properties.get("cartesian", {"selection": "name CA"})
        self.distances: dict = properties.get("distances", {"selection": "name CA", "cutoff": 0.4, "periodic": True, "bonded": False})
        self.angles: dict = properties.get("angles", {"selection": "backbone", "periodic": True, "bonded": True})
        self.dihedrals: dict = properties.get("dihedrals", {"selection": "backbone", "periodic": True, "bonded": True})
        self.options: dict = properties.get("options", {"norm_in": {"mode": "min_max"}})

        # Check the properties
        self.check_properties(properties)
        self.check_arguments()

        # Topology indices
        self.topology_indices()

        # Featurizer
        self.featurize_trajectory()

    @launchlogger
    def topology_indices(self) -> Dict[str, Any]:
        """Build the topology selector and the per-feature index dictionary.

        Sets ``self.topology`` and ``self.features_idx_dict`` and logs the
        topology counters that are available.

        Returns:
            The dictionary produced by ``topology_indexing`` (also stored in
            ``self.features_idx_dict``).
        """
        fu.log("## BioBB Featurization - MDFeaturePipeline ##", self.out_log)

        fu.log(f"Obtaining the topology information from {self.input_topology_path}", self.out_log)

        self.topology = MDTopologySelector(self.input_topology_path)
        self.features_idx_dict = self.topology.topology_indexing(self.config)

        fu.log("Available Topology Properties:", self.out_log)
        fu.log(f" - Number of chains: {self.topology.topology.n_chains}", self.out_log)
        fu.log(f" - Number of residues: {self.topology.topology.n_residues}", self.out_log)
        fu.log(f" - Number of atoms: {self.topology.n_atoms}", self.out_log)
        # These counters are optional attributes of the selector; report
        # "N/A" for whichever ones it does not expose.
        for feature_name in ("distances", "angles", "dihedrals"):
            count = getattr(self.topology, f"n_{feature_name}", "N/A")
            fu.log(f" - Number of {feature_name}: {count}", self.out_log)

        return self.features_idx_dict

    @launchlogger
    def featurize_trajectory(self) -> None:
        """Compute the features and fill ``self.dataset`` and ``self.stats``.

        Labels and weights, when their paths were given, are loaded with
        ``numpy.load`` and stored in the dataset under the ``labels`` and
        ``weights`` keys. A summary of the result is written to the log.
        """
        self.featurizer = Featurizer(self.input_trajectory_path,
                                     self.input_topology_path,
                                     self.input_labels_npy_path,
                                     self.input_weights_npy_path,
                                     )

        fu.log("Available Trajectory Properties:", self.out_log)
        fu.log(f" - Number of frames: {self.featurizer.trajectory.n_frames}", self.out_log)

        fu.log(f"Featurizing the trajectory {self.input_trajectory_path}", self.out_log)

        self.dataset, self.stats = self.featurizer.compute_features(self.features_idx_dict)

        if self.input_labels_npy_path:
            fu.log(f"Loading labels from {self.input_labels_npy_path}", self.out_log)
            self.dataset['labels'] = np.load(self.input_labels_npy_path)

        if self.input_weights_npy_path:
            fu.log(f"Loading weights from {self.input_weights_npy_path}", self.out_log)
            self.dataset['weights'] = np.load(self.input_weights_npy_path)

        fu.log("Features:", self.out_log)
        for feature_type in self.feature_types:
            # Only the two lookups can raise AttributeError (e.g. a feature
            # entry that is not a dict); such a feature is left out of the
            # summary instead of aborting the run.
            try:
                selection = getattr(self, feature_type).get("selection")
                n_features = self.featurizer.features.get(feature_type, np.zeros((0, 0))).shape[1]
            except AttributeError:
                continue
            fu.log(f" {feature_type.capitalize()}:", self.out_log)
            fu.log(f" - Topology Selection: {selection}", self.out_log)
            fu.log(f" - Number of features: {n_features}", self.out_log)

        fu.log("Postprocessing:", self.out_log)
        fu.log(f" - Normalization: {self.options.get('norm_in', {}).get('mode')}", self.out_log)
        # No default here: an unset timelag is reported as None, not as "{}".
        fu.log(f" - Timelag: {self.options.get('timelag')}", self.out_log)
        fu.log("Dataset Properties:", self.out_log)
        fu.log(f" - Dataset: {self.dataset.keys()}", self.out_log)
        fu.log(f" - Number of frames: {self.dataset['data'].shape[0]}", self.out_log)
        fu.log(f" - Number of features: {self.dataset['data'].shape[1]}", self.out_log)

    @launchlogger
    def launch(self) -> int:
        """
        Execute the :class:`MDFeaturePipeline <mdae.mdfeaturizer.MDFeaturePipeline>` object

        Saves the dataset and its statistics with ``torch.save`` and returns 0.
        """

        # Setup Biobb
        if self.check_restart():
            return 0

        self.stage_files()

        torch.save(self.dataset,
                   self.output_dataset_pt_path)

        fu.log(f'Dataset saved in .pt format in {os.path.abspath(self.io_dict["out"]["output_dataset_pt_path"])}',
               self.out_log,
               )
        fu.log(f'File size: {get_size(self.io_dict["out"]["output_dataset_pt_path"])}',
               self.out_log,
               )

        # Write to the declared output path, exactly as for the dataset, so
        # that the log lines below and the final output check refer to the
        # file that was actually written.
        torch.save(self.stats,
                   self.output_stats_pt_path)

        fu.log(f'Dataset statistics saved in .pt format in {os.path.abspath(self.io_dict["out"]["output_stats_pt_path"])}',
               self.out_log,
               )
        fu.log(f'File size: {get_size(self.io_dict["out"]["output_stats_pt_path"])}',
               self.out_log,
               )

        # Copy files to host
        self.copy_to_host()

        # Remove temporal files
        self.remove_tmp_files()

        self.check_arguments(output_files_created=True, raise_exception=False)

        return 0
def MDFeaturizer(
    input_topology_path: str,
    output_dataset_pt_path: str,
    output_stats_pt_path: str,
    properties: dict,
    input_trajectory_path: Optional[str] = None,
    input_labels_npy_path: Optional[str] = None,
    input_weights_npy_path: Optional[str] = None,
    **kwargs,
) -> int:
    """Build a :class:`MDFeaturePipeline` from the given arguments and return
    the result of its ``launch()`` method."""
    # Every argument is forwarded by keyword. Extra keyword arguments travel
    # as one entry literally named ``kwargs`` (they are not unpacked).
    pipeline = MDFeaturePipeline(
        input_topology_path=input_topology_path,
        output_dataset_pt_path=output_dataset_pt_path,
        output_stats_pt_path=output_stats_pt_path,
        properties=properties,
        input_trajectory_path=input_trajectory_path,
        input_labels_npy_path=input_labels_npy_path,
        input_weights_npy_path=input_weights_npy_path,
        kwargs=kwargs,
    )
    return pipeline.launch()
# The functional wrapper shares the documentation of the class it launches.
MDFeaturizer.__doc__ = MDFeaturePipeline.__doc__

# Entry point returned by get_main for the wrapper and its one-line
# description (presumably the command-line interface -- confirm in biobb_common).
main = MDFeaturePipeline.get_main(
    MDFeaturizer,
    "Obtain the Molecular Dynamics Features for PyTorch model training.",
)

if __name__ == "__main__":
    main()