Coverage for biobb_ml/resampling/reg_resampler.py: 68%

76 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-10-03 14:57 +0000

1""" Class created by Atif Hassan to ease the resampling of continuous or regression datasets 

2Source code: 

3https://github.com/atif-hassan/Regression_ReSampling 

4Tutorial: 

5https://towardsdatascience.com/repurposing-traditional-resampling-techniques-for-regression-tasks-d1a9939dab5d 

6""" 

7 

8 

class resampler:
    """Bin a continuous target into classes so class-based samplers can be applied.

    ``fit`` derives one integer class label per row from the target column;
    ``resample`` then runs any object exposing ``fit_resample(X, y)`` and
    splits its output back into features and target.
    """

    def __init__(self):
        # The modules are imported here and stored on the instance; the
        # attribute names are kept because they are part of the existing
        # interface of this class.
        import pandas as pd
        from sklearn.preprocessing import LabelEncoder
        from collections import Counter
        import numpy as np
        self.bins = 3
        self.pd = pd
        self.LabelEncoder = LabelEncoder
        self.Counter = Counter
        self.X = 0
        # The integer 0 doubles as the "fit() has not run yet" marker that
        # resample() checks for.
        self.Y_classes = 0
        self.target = 0
        self.np = np

    def fit(self, X, target, bins=3, min_n_samples=6, balanced_binning=False, verbose=2):
        """Assign a class label to every row by binning the target column.

        Args:
            X: Input data. With an integer ``target`` this may be a pandas
                DataFrame or anything ``pandas.DataFrame`` accepts (for
                example a 2-D NumPy array); with a label ``target`` it must
                be a DataFrame.
            target: Target column. An ``int`` is a column position (negative
                values count from the end); anything else is used as a column
                label of ``X``.
            bins: Forwarded to ``pandas.cut`` (``bins``) or ``pandas.qcut``
                (``q``).
            min_n_samples: A bin holding fewer samples than this is merged
                with a neighbouring bin.
            balanced_binning: Use ``pandas.qcut`` instead of ``pandas.cut``.
            verbose: ``> 0`` prints merge messages, ``> 1`` additionally
                prints the final class distribution.

        Returns:
            A tuple ``(ranges, classes, target_pos)``: ``ranges[k]`` is the
            ``[left, right]`` span of class ``k``, ``classes`` is a NumPy
            array with one label per row, and ``target_pos`` is the position
            of the relocated ``"target"`` column for an integer ``target``
            (``None`` otherwise).

        Raises:
            ValueError: If a target value could not be assigned to any bin
                (missing values).
        """
        self.bins = bins
        positional = type(target) is int
        position = None

        if positional:
            # Positional targets come from header-less data; accept raw
            # arrays by wrapping them so that .iloc is available.
            if not isinstance(X, self.pd.DataFrame):
                X = self.pd.DataFrame(X)
            n_cols = X.shape[1]
            if target < 0:
                target = n_cols + target
            position = target
            # Features are renamed to their position; the target is moved to
            # the end under the fixed name "target".
            columns = {str(j): X.iloc[:, j] for j in range(n_cols) if j != target}
            columns["target"] = X.iloc[:, target]
            self.X = self.pd.DataFrame(columns)
            target_pos = self.X.columns.get_loc("target")
            target = "target"
        else:
            target_pos = None
            self.X = X.copy()

        # Use qcut if balanced binning is required
        if balanced_binning:
            binned = self.pd.qcut(self.X[target], q=self.bins, precision=0)
        else:
            binned = self.pd.cut(self.X[target], bins=self.bins)

        # Category codes follow the ascending order of the bins. Compacting
        # them removes empty bins, so label k is the k-th populated bin.
        codes = binned.cat.codes.to_numpy()
        observed, labels = self.np.unique(codes, return_inverse=True)
        labels = labels.reshape(-1)
        if observed.size and observed[0] < 0:
            raise ValueError("target column contains missing values that cannot be binned")
        intervals = binned.cat.categories
        edges = [[intervals[int(k)].left, intervals[int(k)].right] for k in observed]
        counts = self.np.bincount(labels, minlength=len(edges))

        # Merge under-populated bins. ``ranges`` is built alongside the label
        # mapping so both always describe the same merged classes.
        ranges = []
        group_of = []      # merged class index for every original label
        group_label = []   # original label used to name each merged class
        for label, (count, (left, right)) in enumerate(zip(counts, edges)):
            backward = label > 0 and count < min_n_samples
            # The first bin has no lower neighbour, so it is absorbed by the
            # second bin instead of wrapping around to the last one.
            forward = label == 1 and counts[0] < min_n_samples
            if backward or forward:
                ranges[-1][1] = right
                if backward:
                    source, destination = label, group_label[-1]
                else:
                    source, destination = 0, label
                    group_label[-1] = label
                if verbose > 0:
                    print("INFO: Class " + str(source) + " has been merged into Class " + str(destination) + " due to low number of samples")
            else:
                ranges.append([left, right])
                group_label.append(label)
            group_of.append(len(ranges) - 1)

        if verbose > 0:
            print()

        # Merged classes are numbered consecutively, so no label is skipped.
        self.Y_classes = self.np.asarray(group_of, dtype=labels.dtype)[labels]

        # Pretty print
        if verbose > 1:
            print("Class Distribution:\n-------------------")
            for class_, count in enumerate(self.np.bincount(self.Y_classes)):
                print(str(class_) + ": " + str(count))
            print()

        # Remember how the target was addressed so resample() can split the
        # data the same way.
        self.X["classes"] = self.Y_classes
        self.target = position if positional else target
        return ranges, self.Y_classes, target_pos

    def resample(self, sampler_obj, trainX, trainY):
        """Run ``sampler_obj.fit_resample`` and split the result into X and y.

        Args:
            sampler_obj: Object with a ``fit_resample(X, y)`` method returning
                a pair whose first element is the resampled data (DataFrame
                or NumPy array) including the target column.
            trainX: Data handed to the sampler.
            trainY: Class labels handed to the sampler (typically the array
                returned by ``fit``).

        Returns:
            ``None`` (after printing an error) when ``fit`` has not been run.
            For an integer target: two NumPy arrays ``(features, target)``
            split by column position. Otherwise: a DataFrame without the
            target column and the target column as a Series.
        """
        # If classes haven't yet been created, then run the "fit" function
        if type(self.Y_classes) is int:
            print("Error! Run fit method first!!")
            return None

        # Finally, perform the re-sampling
        resampled_data, _ = sampler_obj.fit_resample(trainX, trainY)
        positional = type(self.target) is int

        if isinstance(resampled_data, self.np.ndarray):
            if positional:
                # Column labels are irrelevant for a positional split.
                resampled_data = self.pd.DataFrame(resampled_data)
            else:
                resampled_data = self.pd.DataFrame(resampled_data, columns=self.X.drop("classes", axis=1).columns)

        # Return the correct X and Y
        if positional:
            # fit() picked the target by position, so split by position too
            # instead of relying on the column labels being 0..n-1.
            keep = [j for j in range(resampled_data.shape[1]) if j != self.target]
            return resampled_data.iloc[:, keep].values, resampled_data.iloc[:, self.target].values
        return resampled_data.drop(self.target, axis=1), resampled_data[self.target]

121 else: 

122 return resampled_data.drop(self.target, axis=1), resampled_data[self.target]