Coverage for biobb_ml/resampling/reg_resampler.py: 68%
76 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-10-03 14:57 +0000
« prev ^ index » next coverage.py v7.6.1, created at 2024-10-03 14:57 +0000
""" Class created by Atif Hassan to ease the resampling of continuous or regression datasets
Source code:
https://github.com/atif-hassan/Regression_ReSampling
Tutorial:
https://towardsdatascience.com/repurposing-traditional-resampling-techniques-for-regression-tasks-d1a9939dab5d
"""
class resampler:
    """Re-sample regression datasets by treating binned target values as classes.

    ``fit`` bins the continuous target column and labels every sample with the
    index of its bin; ``resample`` then runs a classification re-sampler and
    splits its output back into features and target.
    """

    def __init__(self):
        # The dependencies are imported here instead of at module level and are
        # exposed as attributes, which the methods below use.
        import pandas as pd
        from sklearn.preprocessing import LabelEncoder
        from collections import Counter
        import numpy as np
        self.bins = 3
        self.pd = pd
        # No longer used by fit(); kept so existing attribute access still works.
        self.LabelEncoder = LabelEncoder
        self.Counter = Counter
        self.X = 0
        # The integer 0 doubles as the "fit() has not run yet" marker that
        # resample() checks for.
        self.Y_classes = 0
        self.target = 0
        self.np = np

    @staticmethod
    def _is_position(value):
        """Return True for a plain integer column position (bools excluded)."""
        return isinstance(value, int) and not isinstance(value, bool)

    @staticmethod
    def _merge_small_classes(counts, ranges, min_n_samples, verbose):
        """Merge under-populated bins into a neighbouring bin.

        Args:
            counts: number of samples per class, in ascending class order.
            ranges: ``[left, right]`` edges per class, in the same order.
            min_n_samples: classes with fewer samples than this are merged.
            verbose: a value above 0 prints one line per merge.

        Returns:
            ``(group_of, merged_ranges)`` where ``group_of[c]`` is the final
            class of original class ``c`` and ``merged_ranges[g]`` holds the
            ``[left, right]`` edges of final class ``g``.
        """
        group_of, merged_ranges, group_sizes, group_names = [], [], [], []
        for cls, (size, (left, right)) in enumerate(zip(counts, ranges)):
            # The lowest bin has no lower neighbour to merge into, so while the
            # first group is still too small it absorbs the bins that follow.
            leading_short = len(merged_ranges) == 1 and group_sizes[0] < min_n_samples
            if merged_ranges and (leading_short or size < min_n_samples):
                if leading_short:
                    absorbed, keeper = group_names[-1], cls
                    group_names[-1] = cls
                else:
                    absorbed, keeper = cls, group_names[-1]
                # Bins are visited in ascending order, so a merge only ever
                # extends the right edge of the most recent group.
                merged_ranges[-1][1] = right
                group_sizes[-1] += size
                if verbose > 0:
                    print("INFO: Class " + str(absorbed) + " has been merged into Class " + str(keeper) + " due to low number of samples")
            else:
                merged_ranges.append([left, right])
                group_sizes.append(size)
                group_names.append(cls)
            group_of.append(len(merged_ranges) - 1)
        return group_of, merged_ranges

    def fit(self, X, target, bins=3, min_n_samples=6, balanced_binning=False, verbose=2):
        """Bin the target column and assign a class to every sample.

        Args:
            X: the dataset as a pandas DataFrame; any other input is first
                wrapped in ``pandas.DataFrame``.
            target: the target column, either as an integer position (negative
                values count from the last column) or as a column label.
            bins: number of bins, passed to ``pandas.cut`` / ``pandas.qcut``.
            min_n_samples: classes with fewer samples than this are merged
                into a neighbouring class.
            balanced_binning: use ``pandas.qcut`` instead of ``pandas.cut``.
            verbose: above 0 reports merges, above 1 also prints the final
                class distribution.

        Returns:
            A tuple ``(ranges, classes, target_pos)``: ``ranges[c]`` is the
            ``[left, right]`` interval of target values covered by class ``c``,
            ``classes`` is a numpy array with the class of every sample, and
            ``target_pos`` is the position of the ``"target"`` column in
            ``self.X`` for an integer ``target`` (``None`` otherwise).

        Raises:
            ValueError: if some target values are missing and cannot be binned.
        """
        pd, np = self.pd, self.np
        self.bins = bins
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

        if self._is_position(target):
            n_columns = frame.shape[1]
            if target < 0:
                target += n_columns
            # Features are renamed to their original position and the target
            # column is moved to the end under the name "target".
            feature_positions = [pos for pos in range(n_columns) if pos != target]
            self.X = frame.iloc[:, feature_positions].copy()
            self.X.columns = [str(pos) for pos in feature_positions]
            self.X["target"] = frame.iloc[:, target]
            target_pos = self.X.columns.get_loc("target")
            target_column = "target"
        else:
            target_pos = None
            self.X = frame.copy()
            target_column = target
        self.target = target

        # qcut gives (roughly) equally populated bins, cut equally wide ones.
        if balanced_binning:
            binned = pd.qcut(self.X[target_column], q=self.bins, precision=0)
        else:
            binned = pd.cut(self.X[target_column], bins=self.bins)

        bin_codes = binned.cat.codes.to_numpy()
        if (bin_codes < 0).any():
            raise ValueError("target column contains missing values that cannot be binned")

        # Dense labels 0..k-1 over the occupied bins in ascending interval
        # order; the ranges are built in that same order so that ranges[c]
        # always describes class c, whatever the order of the rows.
        occupied, labels = np.unique(bin_codes, return_inverse=True)
        labels = labels.reshape(-1)
        intervals = binned.cat.categories
        ranges = [[intervals[int(code)].left, intervals[int(code)].right] for code in occupied]
        counts = np.bincount(labels, minlength=len(occupied)).tolist()

        group_of, ranges = self._merge_small_classes(counts, ranges, min_n_samples, verbose)
        # Groups are numbered consecutively, so no second encoding pass is needed.
        self.Y_classes = np.asarray(group_of, dtype=labels.dtype)[labels]

        if verbose > 0:
            print()

        # Pretty print
        if verbose > 1:
            print("Class Distribution:\n-------------------")
            for class_, count in enumerate(np.bincount(self.Y_classes)):
                print(str(class_) + ": " + str(count))
            print()

        self.X["classes"] = self.Y_classes
        return ranges, self.Y_classes, target_pos

    def resample(self, sampler_obj, trainX, trainY):
        """Run ``sampler_obj.fit_resample`` and split the result into X and Y.

        Args:
            sampler_obj: object exposing ``fit_resample(X, y)``.
            trainX: data handed to the sampler (features and target column).
            trainY: class of every sample, as produced by ``fit``.

        Returns:
            ``(features, target)``: numpy arrays when ``fit`` was given an
            integer target, pandas objects otherwise. ``None`` (after printing
            an error message) when ``fit`` has not been run.
        """
        # If classes haven't yet been created, then the "fit" method must run first
        if isinstance(self.Y_classes, int):
            print("Error! Run fit method first!!")
            return None

        resampled_data, _ = sampler_obj.fit_resample(trainX, trainY)
        if isinstance(resampled_data, self.np.ndarray):
            # NOTE(review): this labels the array with self.X's column order
            # (features first, target last); confirm callers pass data laid
            # out that way.
            resampled_data = self.pd.DataFrame(resampled_data, columns=self.X.drop("classes", axis=1).columns)

        positional = self._is_position(self.target)
        target_column = self.target
        # A frame rebuilt from a numpy result carries fit()'s "target" label
        # rather than the integer position.
        if positional and target_column not in resampled_data.columns and "target" in resampled_data.columns:
            target_column = "target"

        features = resampled_data.drop(target_column, axis=1)
        values = resampled_data[target_column]
        if positional:
            return features.values, values.values
        return features, values