Coverage for biobb_ml/resampling/reg

""" Class created by Atif Hassan to ease the resampling of continuous or regression datasets

Source code:

https://github.com/atif-hassan/Regression_ReSampling

Tutorial:

https://towardsdatascience.com/repurposing-traditional-resampling-techniques-for-regression-tasks-d1a9939dab5d

"""

class resampler:
    """Bin a continuous target into classes so a class-based re-sampler can be applied.

    Typical use: call ``fit`` to derive one class label per row from the
    continuous target, then call ``resample`` with any object exposing
    ``fit_resample(X, y)`` to obtain the re-sampled features and target.
    """

    def __init__(self):
        # The third-party modules are imported here, at construction time, and
        # kept on the instance; the other methods reach them through ``self``.
        import pandas as pd
        from sklearn.preprocessing import LabelEncoder
        from collections import Counter
        import numpy as np

        self.pd = pd
        self.np = np
        self.Counter = Counter
        self.LabelEncoder = LabelEncoder

        self.bins = 3
        # The integer 0 placeholders below mean "fit() has not run yet";
        # resample() relies on Y_classes still being an int to detect that.
        self.X = 0
        self.Y_classes = 0
        self.target = 0

    @staticmethod
    def _is_position(target):
        """Return True when *target* is a plain integer column position (bool excluded)."""
        return isinstance(target, int) and not isinstance(target, bool)

    def _merge_sparse_classes(self, ranges, min_n_samples, verbose):
        """Fold every class holding fewer than *min_n_samples* rows into a neighbour.

        A sparse class joins the nearest kept class below it. Sparse classes
        that have no kept class below them join the first kept class above.
        If every class is sparse, they all collapse into the lowest one.
        ``self.Y_classes`` is relabelled in place.

        Args:
            ranges: ``[left, right]`` per class, indexed by class label.
            min_n_samples: minimum row count for a class to stand on its own.
            verbose: a value above 0 prints one line per merge.

        Returns:
            ``[left, right]`` for each surviving class, in ascending order.
        """
        # Counts are taken once, before any relabelling, so each class is
        # judged on its own original size.
        counts = self.Counter(self.Y_classes)
        survivors = []  # [label, left, right] of each class that is kept
        orphans = []    # leading sparse classes still waiting for a kept class

        def relabel(source, dest):
            self.Y_classes[self.Y_classes == source] = dest
            if verbose > 0:
                print("INFO: Class " + str(source) + " has been merged into Class " + str(dest) + " due to low number of samples")

        for label in sorted(counts):
            left, right = ranges[label]
            if counts[label] < min_n_samples:
                if survivors:
                    relabel(label, survivors[-1][0])
                    survivors[-1][2] = right  # the kept class now reaches this far
                else:
                    orphans.append((label, left, right))
                continue
            if orphans:
                for orphan, _, _ in orphans:
                    relabel(orphan, label)
                left = orphans[0][1]  # the kept class now starts at the lowest edge
                orphans = []
            survivors.append([label, left, right])

        if orphans:
            # No class reached the threshold: keep a single class spanning everything.
            keep = orphans[0][0]
            for orphan, _, _ in orphans[1:]:
                relabel(orphan, keep)
            survivors.append([keep, orphans[0][1], orphans[-1][2]])

        return [[left, right] for _, left, right in survivors]

    def fit(self, X, target, bins=3, min_n_samples=6, balanced_binning=False, verbose=2):
        """Assign a class to every row by binning the continuous target.

        Args:
            X: pandas DataFrame holding the features and the target column
                (it is accessed through ``.iloc``, ``.shape`` and ``.copy()``).
            target: an ``int`` is read as the column position (negative values
                count from the end); anything else is used as a column label.
            bins: passed to ``pandas.cut`` as ``bins`` or to ``pandas.qcut`` as ``q``.
            min_n_samples: classes with fewer rows than this are merged into
                a neighbouring class.
            balanced_binning: use ``pandas.qcut`` instead of ``pandas.cut``.
            verbose: above 0 prints merge messages, above 1 also prints the
                final class distribution.

        Returns:
            A tuple ``(ranges, classes, target_pos)``: ``ranges[k]`` is the
            ``[left, right]`` interval covered by class ``k``, ``classes`` is
            the per-row class array, and ``target_pos`` is the position of the
            rebuilt ``"target"`` column when *target* was an ``int``, else ``None``.
        """
        self.bins = bins
        requested_target = target

        if self._is_position(target):
            if target < 0:
                target = X.shape[1] + target
            requested_target = target
            # Positional target: rebuild the frame with string column names
            # and the target moved to a trailing "target" column.
            others = [i for i in range(X.shape[1]) if i != target]
            self.X = X.iloc[:, others].copy()
            self.X.columns = [str(i) for i in others]
            self.X["target"] = X.iloc[:, target]
            target_pos = self.X.columns.get_loc("target")
            target = "target"
        else:
            target_pos = None
            self.X = X.copy()

        # Use qcut if balanced binning is required
        if balanced_binning:
            binned = self.pd.qcut(self.X[target], q=self.bins, precision=0)
        else:
            binned = self.pd.cut(self.X[target], bins=self.bins)

        # Only bins that actually contain rows receive a class label, so empty
        # bins are dropped. Categories are in ascending order, which is the
        # order the encoder assigns labels in, so ranges[k] describes class k.
        observed = binned.cat.remove_unused_categories().cat.categories
        ranges = [[interval.left, interval.right] for interval in observed]

        # Pandas outputs intervals after binning. Convert them to integer classes.
        self.Y_classes = self.LabelEncoder().fit_transform(binned)

        ranges = self._merge_sparse_classes(ranges, min_n_samples, verbose)
        if verbose > 0:
            print()

        # Encode once more so the labels are contiguous after merging.
        self.Y_classes = self.LabelEncoder().fit_transform(self.Y_classes)

        if verbose > 1:
            print("Class Distribution:\n-------------------")
            for class_, count in sorted(self.Counter(self.Y_classes).items()):
                print(str(class_) + ": " + str(count))
            print()

        self.X["classes"] = self.Y_classes
        # resample() uses the type of this value to choose its return format.
        self.target = requested_target
        return ranges, self.Y_classes, target_pos

    def resample(self, sampler_obj, trainX, trainY):
        """Run *sampler_obj* on the data and split the result into features and target.

        Args:
            sampler_obj: object exposing ``fit_resample(X, y)``.
            trainX: data handed to the sampler; it must still contain the
                target column, which is separated out again afterwards.
            trainY: class labels handed to the sampler.

        Returns:
            ``None`` (after printing an error) when ``fit`` has not been run.
            Otherwise ``(features, target)``: numpy arrays when ``fit`` was
            given a positional target, pandas objects when it was given a label.
        """
        # Y_classes is still the integer placeholder until fit() replaces it.
        if isinstance(self.Y_classes, int):
            print("Error! Run fit method first!!")
            return None

        resampled_data, _ = sampler_obj.fit_resample(trainX, trainY)
        if isinstance(resampled_data, self.np.ndarray):
            # A bare array carries no column labels; reuse those of the fitted frame.
            fitted_columns = self.X.drop("classes", axis=1).columns
            resampled_data = self.pd.DataFrame(resampled_data, columns=fitted_columns)

        features = resampled_data.drop(self.target, axis=1)
        values = resampled_data[self.target]
        if self._is_position(self.target):
            return features.values, values.values
        return features, values

Coverage for biobb_ml/resampling/reg_resampler.py: 68%

76 statements