Coverage for biobb_dna / backbone / bipopulations.py: 90%

87 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-22 13:05 +0000

1#!/usr/bin/env python3 

2"""Module containing the BIPopulations class and the command line interface.""" 

3 

4from typing import Optional 

5 

6import matplotlib.pyplot as plt 

7import pandas as pd 

8from biobb_common.generic.biobb_object import BiobbObject 

9from biobb_common.tools.file_utils import launchlogger 

10from numpy import nan 

11 

12from biobb_dna.utils.common import _from_string_to_list 

13from biobb_dna.utils.loader import read_series 

14from biobb_dna.utils.transform import inverse_complement 

15 

16 

17class BIPopulations(BiobbObject): 

18 """ 

19 | biobb_dna BIPopulations 

20 | Calculate BI/BII populations from epsilon and zeta parameters. 

21 | Calculate BI/BII populations from epsilon and zeta parameters. 

22 

23 Args: 

24 input_epsilC_path (str): Path to .ser file for helical parameter 'epsilC'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_epsilC.ser>`_. Accepted formats: ser (edam:format_2330). 

25 input_epsilW_path (str): Path to .ser file for helical parameter 'epsilW'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_epsilW.ser>`_. Accepted formats: ser (edam:format_2330). 

26 input_zetaC_path (str): Path to .ser file for helical parameter 'zetaC'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_zetaC.ser>`_. Accepted formats: ser (edam:format_2330). 

27 input_zetaW_path (str): Path to .ser file for helical parameter 'zetaW'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_zetaW.ser>`_. Accepted formats: ser (edam:format_2330). 

28 output_csv_path (str): Path to .csv file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/bipop_ref.csv>`_. Accepted formats: csv (edam:format_3752). 

29 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/bipop_ref.jpg>`_. Accepted formats: jpg (edam:format_3579). 

30 properties (dict): 

31 * **sequence** (*str*) - (None) Nucleic acid sequence corresponding to the input .ser file. Length of sequence is expected to be the same as the total number of columns in the .ser file, minus the index column (even if later on a subset of columns is selected with the *seqpos* option). 

32 * **seqpos** (*list*) - (None) list of sequence positions (columns indices starting by 0) to analyze. If not specified it will analyse the complete sequence. 

33 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files. 

34 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist. 

35 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory. 

36 

37 Examples: 

38 This is a use example of how to use the building block from Python:: 

39 

40 from biobb_dna.backbone.bipopulations import bipopulations 

41 

42 prop = { 

43 'sequence': 'GCAT', 

44 } 

45 bipopulations( 

46 input_epsilC_path='/path/to/epsilC.ser', 

47 input_epsilW_path='/path/to/epsilW.ser', 

48 input_zetaC_path='/path/to/zetaC.ser', 

49 input_zetaW_path='/path/to/zetaW.ser', 

50 output_csv_path='/path/to/table/output.csv', 

51 output_jpg_path='/path/to/table/output.jpg', 

52 properties=prop) 

53 Info: 

54 * wrapped_software: 

55 * name: In house 

56 * license: Apache-2.0 

57 * ontology: 

58 * name: EDAM 

59 * schema: http://edamontology.org/EDAM.owl 

60 

61 """ 

62 

63 def __init__( 

64 self, 

65 input_epsilC_path, 

66 input_epsilW_path, 

67 input_zetaC_path, 

68 input_zetaW_path, 

69 output_csv_path, 

70 output_jpg_path, 

71 properties=None, 

72 **kwargs, 

73 ) -> None: 

74 properties = properties or {} 

75 

76 # Call parent class constructor 

77 super().__init__(properties) 

78 self.locals_var_dict = locals().copy() 

79 

80 # Input/Output files 

81 self.io_dict = { 

82 "in": { 

83 "input_epsilC_path": input_epsilC_path, 

84 "input_epsilW_path": input_epsilW_path, 

85 "input_zetaC_path": input_zetaC_path, 

86 "input_zetaW_path": input_zetaW_path, 

87 }, 

88 "out": { 

89 "output_csv_path": output_csv_path, 

90 "output_jpg_path": output_jpg_path, 

91 }, 

92 } 

93 

94 self.properties = properties 

95 self.sequence = properties.get("sequence") 

96 # self.seqpos = properties.get("seqpos", None) 

97 self.seqpos = [ 

98 int(elem) for elem in _from_string_to_list(properties.get("seqpos", None)) 

99 ] 

100 print("seqpos::::::::") 

101 print(self.seqpos) 

102 # Check the properties 

103 self.check_properties(properties) 

104 self.check_arguments() 

105 

106 @launchlogger 

107 def launch(self) -> int: 

108 """Execute the :class:`BIPopulations <backbone.bipopulations.BIPopulations>` object.""" 

109 

110 # Setup Biobb 

111 if self.check_restart(): 

112 return 0 

113 self.stage_files() 

114 

115 # check sequence 

116 if self.sequence is None or len(self.sequence) < 2: 

117 raise ValueError("sequence is null or too short!") 

118 

119 # check seqpos 

120 if self.seqpos: 

121 if (max(self.seqpos) > len(self.sequence) - 2) or (min(self.seqpos) < 1): 

122 raise ValueError( 

123 f"seqpos values must be between 1 and {len(self.sequence) - 2}" 

124 ) 

125 if not (isinstance(self.seqpos, list) and len(self.seqpos) > 1): 

126 raise ValueError("seqpos must be a list of at least two integers") 

127 else: 

128 self.seqpos = None # type: ignore 

129 

130 # read input files 

131 epsilC = read_series( 

132 self.stage_io_dict["in"]["input_epsilC_path"], usecols=self.seqpos 

133 ) 

134 epsilW = read_series( 

135 self.stage_io_dict["in"]["input_epsilW_path"], usecols=self.seqpos 

136 ) 

137 zetaC = read_series( 

138 self.stage_io_dict["in"]["input_zetaC_path"], usecols=self.seqpos 

139 ) 

140 zetaW = read_series( 

141 self.stage_io_dict["in"]["input_zetaW_path"], usecols=self.seqpos 

142 ) 

143 

144 # calculate difference between epsil and zeta parameters 

145 xlabels = self.get_xlabels(self.sequence, inverse_complement(self.sequence)) 

146 diff_epsil_zeta = self.get_angles_difference(epsilC, zetaC, epsilW, zetaW) 

147 

148 # calculate BI population 

149 BI = (diff_epsil_zeta < 0).sum(axis=0) * 100 / len(diff_epsil_zeta) 

150 BII = 100 - BI 

151 

152 # save table 

153 Bpopulations_df = pd.DataFrame( 

154 {"Nucleotide": xlabels, "BI population": BI, "BII population": BII} 

155 ) 

156 Bpopulations_df.to_csv( 

157 self.stage_io_dict["out"]["output_csv_path"], index=False 

158 ) 

159 

160 # save plot 

161 fig, axs = plt.subplots(1, 1, dpi=300, tight_layout=True) 

162 axs.bar(range(len(xlabels)), BI, label="BI") 

163 axs.bar(range(len(xlabels)), BII, bottom=BI, label="BII") 

164 # empty bar to divide both sequences 

165 axs.bar([len(BI) // 2], [100], color="white", label=None) 

166 axs.legend() 

167 axs.set_xticks(range(len(xlabels))) 

168 axs.set_xticklabels(xlabels, rotation=90) 

169 axs.set_xlabel("Nucleotide Sequence") 

170 axs.set_ylabel("BI/BII Population (%)") 

171 axs.set_title("Nucleotide parameter: BI/BII Population") 

172 fig.savefig(self.stage_io_dict["out"]["output_jpg_path"], format="jpg") 

173 plt.close() 

174 

175 # Copy files to host 

176 self.copy_to_host() 

177 

178 # Remove temporary file(s) 

179 self.remove_tmp_files() 

180 

181 self.check_arguments(output_files_created=True, raise_exception=False) 

182 

183 return 0 

184 

185 def get_xlabels(self, strand1, strand2): 

186 # get list of tetramers, except first and last two bases 

187 labelsW = list(strand1) 

188 labelsW[0] = f"{labelsW[0]}5'" 

189 labelsW[-1] = f"{labelsW[-1]}3'" 

190 labelsW = [f"{i}-{j}" for i, j in zip(labelsW, range(1, len(labelsW) + 1))] 

191 labelsC = list(strand2)[::-1] 

192 labelsC[0] = f"{labelsC[0]}5'" 

193 labelsC[-1] = f"{labelsC[-1]}3'" 

194 labelsC = [f"{i}-{j}" for i, j in zip(labelsC, range(len(labelsC), 0, -1))] 

195 

196 if self.seqpos is not None: 

197 labelsC = [labelsC[i] for i in self.seqpos] 

198 labelsW = [labelsW[i] for i in self.seqpos] 

199 xlabels = labelsW + ["-"] + labelsC 

200 return xlabels 

201 

202 def get_angles_difference(self, epsilC, zetaC, epsilW, zetaW): 

203 # concatenate zeta and epsil arrays 

204 separator_df = pd.DataFrame({"-": nan}, index=range(len(zetaW))) 

205 zeta = pd.concat([zetaW, separator_df, zetaC[zetaC.columns[::-1]]], axis=1) 

206 epsil = pd.concat([epsilW, separator_df, epsilC[epsilC.columns[::-1]]], axis=1) 

207 

208 # difference between epsilon and zeta coordinates 

209 diff_epsil_zeta = epsil - zeta 

210 return diff_epsil_zeta 

211 

212 

213def bipopulations( 

214 input_epsilC_path: str, 

215 input_epsilW_path: str, 

216 input_zetaC_path: str, 

217 input_zetaW_path: str, 

218 output_csv_path: str, 

219 output_jpg_path: str, 

220 properties: Optional[dict] = None, 

221 **kwargs, 

222) -> int: 

223 """Create :class:`BIPopulations <dna.backbone.bipopulations.BIPopulations>` class and 

224 execute the: meth: `launch() <dna.backbone.bipopulations.BIPopulations.launch>` method.""" 

225 return BIPopulations(**dict(locals())).launch() 

226 

227 

228bipopulations.__doc__ = BIPopulations.__doc__ 

229main = BIPopulations.get_main(bipopulations, "Calculate BI/BII populations.") 

230 

231if __name__ == "__main__": 

232 main()