Coverage for biobb_dna / backbone / puckering.py: 90%

92 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-15 18:49 +0000

1#!/usr/bin/env python3 

2"""Module containing the Puckering class and the command line interface.""" 

3 

4from typing import Optional 

5 

6import matplotlib.pyplot as plt 

7import numpy as np 

8import pandas as pd 

9from biobb_common.generic.biobb_object import BiobbObject 

10from biobb_common.tools.file_utils import launchlogger 

11 

12from biobb_dna.utils.common import _from_string_to_list 

13from biobb_dna.utils.loader import read_series 

14from biobb_dna.utils.transform import inverse_complement 

15 

16 

17class Puckering(BiobbObject): 

18 """ 

19 | biobb_dna Puckering 

20 | Calculate Puckering from phase parameters. 

21 | Calculate North/East/West/South distribution of sugar puckering backbone torsions. 

22 

23 Args: 

24 input_phaseC_path (str): Path to .ser file for helical parameter 'phaseC'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_phaseC.ser>`_. Accepted formats: ser (edam:format_2330). 

25 input_phaseW_path (str): Path to .ser file for helical parameter 'phaseW'. File type: input. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/data/backbone/canal_output_phaseW.ser>`_. Accepted formats: ser (edam:format_2330). 

26 output_csv_path (str): Path to .csv file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/puckering_ref.csv>`_. Accepted formats: csv (edam:format_3752). 

27 output_jpg_path (str): Path to .jpg file where output is saved. File type: output. `Sample file <https://raw.githubusercontent.com/bioexcel/biobb_dna/master/biobb_dna/test/reference/backbone/puckering_ref.jpg>`_. Accepted formats: jpg (edam:format_3579). 

28 properties (dict): 

29 * **sequence** (*str*) - (None) Nucleic acid sequence corresponding to the input .ser file. Length of sequence is expected to be the same as the total number of columns in the .ser file, minus the index column (even if later on a subset of columns is selected with the *seqpos* option). 

30 * **stride** (*int*) - (1000) granularity of the number of snapshots for plotting time series. 

31 * **seqpos** (*list*) - (None) list of sequence positions (columns indices starting by 0) to analyze. If not specified it will analyse the complete sequence. 

32 * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files. 

33 * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist. 

34 * **sandbox_path** (*str*) - ("./") [WF property] Parent path to the sandbox directory. 

35 

36 Examples: 

37 This is a use example of how to use the building block from Python:: 

38 

39 from biobb_dna.backbone.puckering import puckering 

40 

41 prop = { 

42 'sequence': 'GCAT', 

43 } 

44 puckering( 

45 input_phaseC_path='/path/to/phaseC.ser', 

46 input_phaseW_path='/path/to/phaseW.ser', 

47 output_csv_path='/path/to/table/output.csv', 

48 output_jpg_path='/path/to/table/output.jpg', 

49 properties=prop) 

50 Info: 

51 * wrapped_software: 

52 * name: In house 

53 * license: Apache-2.0 

54 * ontology: 

55 * name: EDAM 

56 * schema: http://edamontology.org/EDAM.owl 

57 

58 """ 

59 

60 def __init__( 

61 self, 

62 input_phaseC_path, 

63 input_phaseW_path, 

64 output_csv_path, 

65 output_jpg_path, 

66 properties=None, 

67 **kwargs, 

68 ) -> None: 

69 properties = properties or {} 

70 

71 # Call parent class constructor 

72 super().__init__(properties) 

73 self.locals_var_dict = locals().copy() 

74 

75 # Input/Output files 

76 self.io_dict = { 

77 "in": { 

78 "input_phaseC_path": input_phaseC_path, 

79 "input_phaseW_path": input_phaseW_path, 

80 }, 

81 "out": { 

82 "output_csv_path": output_csv_path, 

83 "output_jpg_path": output_jpg_path, 

84 }, 

85 } 

86 

87 self.properties = properties 

88 self.sequence = properties.get("sequence") 

89 self.stride = properties.get("stride", 1000) 

90 self.seqpos = [ 

91 int(elem) for elem in _from_string_to_list(properties.get("seqpos", None)) 

92 ] 

93 

94 # Check the properties 

95 self.check_properties(properties) 

96 self.check_arguments() 

97 

98 @launchlogger 

99 def launch(self) -> int: 

100 """Execute the :class:`Puckering <backbone.puckering.Puckering>` object.""" 

101 

102 # Setup Biobb 

103 if self.check_restart(): 

104 return 0 

105 self.stage_files() 

106 

107 # check sequence 

108 if self.sequence is None or len(self.sequence) < 2: 

109 raise ValueError("sequence is null or too short!") 

110 

111 # check seqpos 

112 if self.seqpos: 

113 if (max(self.seqpos) > len(self.sequence) - 2) or (min(self.seqpos) < 1): 

114 raise ValueError( 

115 f"seqpos values must be between 1 and {len(self.sequence) - 2}" 

116 ) 

117 if not (isinstance(self.seqpos, list) and len(self.seqpos) > 1): 

118 raise ValueError("seqpos must be a list of at least two integers") 

119 else: 

120 self.seqpos = None # type: ignore 

121 

122 # read input files 

123 phaseC = read_series( 

124 self.stage_io_dict["in"]["input_phaseC_path"], usecols=self.seqpos 

125 ) 

126 phaseW = read_series( 

127 self.stage_io_dict["in"]["input_phaseW_path"], usecols=self.seqpos 

128 ) 

129 

130 # fix angle range so its not negative 

131 phaseC = self.fix_angles(phaseC) 

132 phaseW = self.fix_angles(phaseW) 

133 

134 # calculate difference between epsil and zeta parameters 

135 xlabels = self.get_xlabels(self.sequence, inverse_complement(self.sequence)) 

136 Npop, Epop, Wpop, Spop = self.check_puckering(phaseC, phaseW) 

137 

138 # save plot 

139 fig, axs = plt.subplots(1, 1, dpi=300, tight_layout=True) 

140 axs.bar(range(len(xlabels)), Npop, label="North") 

141 axs.bar(range(len(xlabels)), Epop, bottom=Npop, label="East") 

142 axs.bar(range(len(xlabels)), Spop, bottom=Npop + Epop, label="South") 

143 axs.bar(range(len(xlabels)), Wpop, bottom=Npop + Epop + Spop, label="West") 

144 # empty bar to divide both sequences 

145 axs.bar([len(phaseC.columns)], [100], color="white", label=None) 

146 axs.legend() 

147 axs.set_xticks(range(len(xlabels))) 

148 axs.set_xticklabels(xlabels, rotation=90) 

149 axs.set_xlabel("Nucleotide Sequence") 

150 axs.set_ylabel("Puckering (%)") 

151 axs.set_title("Nucleotide parameter: Puckering") 

152 fig.savefig(self.stage_io_dict["out"]["output_jpg_path"], format="jpg") 

153 

154 # save table 

155 populations = pd.DataFrame( 

156 { 

157 "Nucleotide": xlabels, 

158 "North": Npop, 

159 "East": Epop, 

160 "West": Wpop, 

161 "South": Spop, 

162 } 

163 ) 

164 populations.to_csv(self.stage_io_dict["out"]["output_csv_path"], index=False) 

165 

166 plt.close() 

167 

168 # Copy files to host 

169 self.copy_to_host() 

170 

171 # Remove temporary file(s) 

172 self.remove_tmp_files() 

173 

174 self.check_arguments(output_files_created=True, raise_exception=False) 

175 

176 return 0 

177 

178 def get_xlabels(self, strand1, strand2): 

179 # get list of tetramers, except first and last two bases 

180 labelsW = list(strand1) 

181 labelsW[0] = f"{labelsW[0]}5'" 

182 labelsW[-1] = f"{labelsW[-1]}3'" 

183 labelsW = [f"{i}-{j}" for i, j in zip(labelsW, range(1, len(labelsW) + 1))] 

184 labelsC = list(strand2)[::-1] 

185 labelsC[0] = f"{labelsC[0]}5'" 

186 labelsC[-1] = f"{labelsC[-1]}3'" 

187 labelsC = [f"{i}-{j}" for i, j in zip(labelsC, range(len(labelsC), 0, -1))] 

188 

189 if self.seqpos: 

190 labelsC = [labelsC[i] for i in self.seqpos] 

191 labelsW = [labelsW[i] for i in self.seqpos] 

192 xlabels = labelsW + ["-"] + labelsC 

193 return xlabels 

194 

195 def check_puckering(self, phaseC, phaseW): 

196 separator_df = pd.DataFrame({"-": np.nan}, index=range(1, len(phaseC))) 

197 phase = pd.concat([phaseW, separator_df, phaseC[phaseC.columns[::-1]]], axis=1) 

198 # phase.columns = columns 

199 

200 Npop = np.logical_or(phase > 315, phase < 45).mean() * 100 

201 Epop = np.logical_and(phase > 45, phase < 135).mean() * 100 

202 Wpop = np.logical_and(phase > 225, phase < 315).mean() * 100 

203 Spop = np.logical_and(phase > 135, phase < 225).mean() * 100 

204 return Npop, Epop, Wpop, Spop 

205 

206 def fix_angles(self, dataset): 

207 values = np.where(dataset < 0, dataset + 360, dataset) 

208 # values = np.where(values > 360, values - 360, values) 

209 dataset = pd.DataFrame(values) 

210 return dataset 

211 

212 

213def puckering( 

214 input_phaseC_path: str, 

215 input_phaseW_path: str, 

216 output_csv_path: str, 

217 output_jpg_path: str, 

218 properties: Optional[dict] = None, 

219 **kwargs, 

220) -> int: 

221 """Create :class:`Puckering <dna.backbone.puckering.Puckering>` class and 

222 execute the: meth: `launch() <dna.backbone.puckering.Puckering.launch>` method.""" 

223 return Puckering(**dict(locals())).launch() 

224 

225 

226puckering.__doc__ = Puckering.__doc__ 

227main = Puckering.get_main(puckering, "Calculate North/East/West/South distribution of sugar puckering backbone torsions.") 

228 

229if __name__ == "__main__": 

230 main()