Coverage for biobb_dna/dna/dna_timeseries_unzip.py: 67%

86 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2025-01-28 10:36 +0000

1#!/usr/bin/env python3 

2 

3"""Module containing the DnaTimeseriesUnzip class and the command line interface.""" 

4import re 

5import zipfile 

6import shutil 

7import argparse 

8from typing import Optional 

9 

10from biobb_dna.utils import constants 

11from biobb_common.generic.biobb_object import BiobbObject 

12from biobb_common.configuration import settings 

13from biobb_common.tools import file_utils as fu 

14from biobb_common.tools.file_utils import launchlogger 

15 

16 

class DnaTimeseriesUnzip(BiobbObject):
    """
    | biobb_dna DnaTimeseriesUnzip
    | Tool for extracting dna_timeseries output files.
    | Unzips a zip file containing dna_timeseries output files and extracts the csv and jpg files.

    Args:
        input_zip_file (str): Zip file with dna_timeseries output files. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/dna/timeseries_output.zip>`_. Accepted formats: zip (edam:format_3987).
        output_path_csv (str): dna_timeseries output csv file contained within input_zip_file. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/dna/dna_timeseries_unzip.csv>`_. Accepted formats: csv (edam:format_3752).
        output_path_jpg (str): dna_timeseries output jpg file contained within input_zip_file. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/dna/dna_timeseries_unzip.jpg>`_. Accepted formats: jpg (edam:format_3579).
        output_list_path (str) (Optional): Text file with a list of all dna_timeseries output files contained within input_zip_file. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/dna/dna_timeseries_unzip.txt>`_. Accepted formats: txt (edam:format_2330).
        properties (dic):
            * **type** (*str*) - (None) Type of analysis, series or histogram. Values: series, hist.
            * **parameter** (*str*) - (None) Type of parameter. Values: majd, majw, mind, minw, inclin, tip, xdisp, ydisp, shear, stretch, stagger, buckle, propel, opening, rise, roll, twist, shift, slide, tilt, alphaC, alphaW, betaC, betaW, gammaC, gammaW, deltaC, deltaW, epsilC, epsilW, zetaC, zetaW, chiC, chiW, phaseC, phaseW.
            * **sequence** (*str*) - (None) Nucleic acid sequence used for generating dna_timeseries output file.
            * **index** (*int*) - (1) Base pair index in the parameter 'sequence', starting from 1.
            * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
            * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.
            * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory.
    Examples:
        This is a use example of how to use the building block from Python::

            from biobb_dna.dna.dna_timeseries_unzip import dna_timeseries_unzip
            prop = {
                'type': 'hist',
                'parameter': 'shift',
                'sequence': 'CGCGAATTCGCG',
                'index': 5
            }
            dna_timeseries_unzip(
                input_zip_file='/path/to/dna_timeseries/output.zip',
                output_path_csv='/path/to/output.csv',
                output_path_jpg='/path/to/output.jpg',
                output_list_path='/path/to/output.txt',
                properties=prop)
    Info:
        * wrapped_software:
            * name: In house
            * license: Apache-2.0
        * ontology:
            * name: EDAM
            * schema: http://edamontology.org/EDAM.owl
    """

    def __init__(self, input_zip_file,
                 output_path_csv, output_path_jpg, output_list_path=None, properties=None, **kwargs) -> None:
        properties = properties or {}

        # Call parent class constructor
        super().__init__(properties)
        # Snapshot of the constructor arguments; must be taken before any
        # other local variable is created in this scope.
        self.locals_var_dict = locals().copy()

        # Input/Output files
        self.io_dict = {
            'in': {
                'input_zip_file': input_zip_file
            },
            'out': {
                'output_path_csv': output_path_csv,
                'output_path_jpg': output_path_jpg,
                'output_list_path': output_list_path
            }
        }

        # Properties specific for BB
        self.type = properties.get('type', None)
        self.parameter = properties.get('parameter', None)
        self.sequence = properties.get('sequence', None)
        self.index = properties.get('index', 1)
        self.properties = properties

        # Check the properties
        self.check_properties(properties)
        self.check_arguments()

    def _extract_member(self, zip_ref, available, member, target_path) -> None:
        """Copy ``member`` from the open archive ``zip_ref`` into ``target_path``.

        ``available`` is the list of member names of the archive. If ``member``
        is not in it, the problem is logged and ``SystemExit(1)`` is raised.
        """
        if member not in available:
            fu.log(f"File {member} not found in the zip file.", self.out_log, self.global_log)
            raise SystemExit(1)

        fu.log(f'{member} exists, copying into {target_path}.',
               self.out_log, self.global_log)
        with zip_ref.open(member) as source, open(target_path, "wb") as target:
            shutil.copyfileobj(source, target)

    @launchlogger
    def launch(self) -> int:
        """Execute the :class:`DnaTimeseriesUnzip <biobb_dna.dna.dna_timeseries_unzip.DnaTimeseriesUnzip>` object.

        Validates the 'type', 'parameter', 'sequence' and 'index' properties,
        builds the expected member name ``<type>_<parameter>_<index>_<bp>`` and
        copies the matching csv and jpg members of the input zip into the
        output paths. When ``output_list_path`` is set, the names of all the
        archive members are written to it, one per line.

        Returns:
            int: 0 when the restart check short-circuits the execution,
            otherwise ``self.return_code``.

        Raises:
            SystemExit: with code 1 when a property is missing or invalid, or
                when the expected csv/jpg member is not in the archive.
        """

        # Setup Biobb
        if self.check_restart():
            return 0
        self.stage_files()

        # NOTE: failures raise SystemExit directly instead of calling the
        # ``exit`` builtin, which is injected by the ``site`` module for
        # interactive use and is not meant to be used in programs.

        # Check that all the mandatory properties are set
        if self.type is None or self.parameter is None or self.sequence is None:
            fu.log("Properties 'type', 'parameter' and 'sequence' are mandatory to run DnaTimeseriesUnzip. Please set them.",
                   self.out_log, self.global_log)
            raise SystemExit(1)

        # Check that the type is valid
        if self.type not in ["series", "hist"]:
            fu.log(f"Type {self.type} not valid. Valid types are: series, hist.",
                   self.out_log, self.global_log)
            raise SystemExit(1)

        # Check that the parameter is valid
        if self.parameter not in constants.helical_parameters:
            fu.log(f"Parameter {self.parameter} not valid. Valid parameters are: {constants.helical_parameters}.",
                   self.out_log, self.global_log)
            raise SystemExit(1)

        # Check that the sequence is valid. fullmatch is used because
        # '^...$' with re.match would also accept a trailing newline.
        if not re.fullmatch(r'[ACGT]+', self.sequence):
            fu.log(f"Sequence {self.sequence} not valid. Only 'A', 'C', 'G' or 'T' bases allowed.",
                   self.out_log, self.global_log)
            raise SystemExit(1)

        # Check that the index is valid (1-based, last accepted value is len - 2)
        if self.index < 1 or self.index >= len(self.sequence) - 1:
            fu.log(f"Index {self.index} not valid. It should be between 1 and {len(self.sequence) - 2}.",
                   self.out_log, self.global_log)
            raise SystemExit(1)

        # Get index sequence base and next base
        bp = self.sequence[self.index-1] + self.sequence[self.index]

        # Get the filename
        filename = f"{self.type}_{self.parameter}_{self.index}_{bp}"
        csv_file = f"{filename}.csv"
        jpg_file = f"{filename}.jpg"

        # Unzip the file
        staged_out = self.stage_io_dict["out"]
        with zipfile.ZipFile(self.stage_io_dict["in"]["input_zip_file"], 'r') as zip_ref:
            # Read the member names once and reuse them for every lookup
            members = zip_ref.namelist()

            # Extract the csv first, then the jpg (each one aborts if missing)
            self._extract_member(zip_ref, members, csv_file, staged_out["output_path_csv"])
            self._extract_member(zip_ref, members, jpg_file, staged_out["output_path_jpg"])

            # Write the list of files
            if staged_out["output_list_path"]:
                with open(staged_out["output_list_path"], "w") as f:
                    for name in members:
                        f.write(f"{name}\n")

        # Run Biobb block
        # self.run_biobb()

        # Copy files to host
        self.copy_to_host()

        # Remove temporary file(s)
        # self.tmp_files.extend([
        #     self.stage_io_dict.get("unique_dir", "")
        # ])
        self.remove_tmp_files()

        self.check_arguments(output_files_created=True, raise_exception=False)

        return self.return_code

184 

185 

def dna_timeseries_unzip(
        input_zip_file: str,
        output_path_csv: str,
        output_path_jpg: str,
        output_list_path: Optional[str] = None,
        properties: Optional[dict] = None,
        **kwargs) -> int:
    """Create :class:`DnaTimeseriesUnzip <biobb_dna.dna.dna_timeseries_unzip.DnaTimeseriesUnzip>` class and
    execute the :meth:`launch() <biobb_dna.dna.dna_timeseries_unzip.DnaTimeseriesUnzip.launch>` method."""

    return DnaTimeseriesUnzip(
        input_zip_file=input_zip_file,
        output_path_csv=output_path_csv,
        output_path_jpg=output_path_jpg,
        output_list_path=output_list_path,
        properties=properties, **kwargs).launch()


# Expose the full building block documentation on the function wrapper.
# This has to live at module level: inside the function body, after the
# ``return`` statement, it would never be executed.
dna_timeseries_unzip.__doc__ = DnaTimeseriesUnzip.__doc__

204 

205 

def main():
    """Command line entry point: parse the arguments, load the properties and run the building block."""
    cli = argparse.ArgumentParser(
        description='Tool for extracting dna_timeseries output files.',
        formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999))
    cli.add_argument('--config', required=False, help='Configuration file')

    # Mandatory file arguments, registered in the order they appear in the help
    mandatory_group = cli.add_argument_group('required arguments')
    mandatory_flags = (
        ('--input_zip_file',
         'Zip file with dna_timeseries output files. Accepted formats: zip.'),
        ('--output_path_csv',
         'dna_timeseries output csv file contained within input_zip_file. Accepted formats: csv.'),
        ('--output_path_jpg',
         'dna_timeseries output jpg file contained within input_zip_file. Accepted formats: jpg.'),
    )
    for flag, description in mandatory_flags:
        mandatory_group.add_argument(flag, required=True, help=description)

    # Optional file argument
    cli.add_argument(
        '--output_list_path', required=False,
        help='Text file with a list of all dna_timeseries output files contained within input_zip_file. Accepted formats: txt.')

    cli_args = cli.parse_args()
    # Fall back to an empty JSON object when no configuration is given
    cli_args.config = cli_args.config or "{}"
    properties = settings.ConfReader(config=cli_args.config).get_prop_dic()

    dna_timeseries_unzip(
        input_zip_file=cli_args.input_zip_file,
        output_path_csv=cli_args.output_path_csv,
        output_path_jpg=cli_args.output_path_jpg,
        output_list_path=cli_args.output_list_path,
        properties=properties)

232 

233 

# Run the command line interface only when executed as a script.
if __name__ == "__main__":
    main()