From 971d90d349fa3f9d65313ae1c784a82d614b8802 Mon Sep 17 00:00:00 2001 From: jonathan Date: Mon, 11 Dec 2023 20:46:29 +0330 Subject: [PATCH] issue with non outliers not to be deleted --- AutoClean/autoclean.py | 6 +++--- AutoClean/modules.py | 22 ++++++++++++++-------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/AutoClean/autoclean.py b/AutoClean/autoclean.py index f7e911c..691429c 100644 --- a/AutoClean/autoclean.py +++ b/AutoClean/autoclean.py @@ -11,7 +11,7 @@ class AutoClean: - def __init__(self, input_data, mode='auto', duplicates=False, missing_num=False, missing_categ=False, encode_categ=False, extract_datetime=False, outliers=False, outlier_param=1.5, logfile=True, verbose=False): + def __init__(self, input_data, mode='auto', duplicates=False, missing_num=False, missing_categ=False, encode_categ=False, extract_datetime=False, outliers=False, outlier_skip=[], outlier_param=1.5, logfile=True, verbose=False): ''' input_data (dataframe)..........Pandas dataframe mode (str)......................define in which mode you want to run AutoClean @@ -135,12 +135,12 @@ def _validate_params(self, df, verbose, logfile): logger.info('Completed validation of input parameters') return - def _clean_data(self, df, input_data): + def _clean_data(self, df, input_data, skip_duplicate=[]): # function for starting the autoclean process df = df.reset_index(drop=True) df = Duplicates.handle(self, df) df = MissingValues.handle(self, df) - df = Outliers.handle(self, df) + df = Outliers.handle(self, df, outlier_skip=[]) df = Adjust.convert_datetime(self, df) df = EncodeCateg.handle(self, df) df = Adjust.round_values(self, df, input_data) diff --git a/AutoClean/modules.py b/AutoClean/modules.py index 6a2102d..9f7df07 100644 --- a/AutoClean/modules.py +++ b/AutoClean/modules.py @@ -264,16 +264,18 @@ def _delete(self, df, type): class Outliers: - def handle(self, df): + def handle(self, df, outlier_skip): # function for handling of outliers in the data if self.outliers: + if not set(outlier_skip).issubset(df.columns): + raise ValueError("Some columns in outlier_skip do not exist in the DataFrame") logger.info('Started handling of outliers... Method: "{}"', str(self.outliers).upper()) start = timer() if self.outliers in ['auto', 'winz']: - df = Outliers._winsorization(self, df) + df = Outliers._winsorization(self, df, outlier_skip) elif self.outliers == 'delete': - df = Outliers._delete(self, df) + df = Outliers._delete(self, df, outlier_skip) end = timer() logger.info('Completed handling of outliers in {} seconds', round(end-start, 6)) @@ -281,10 +283,12 @@ def handle(self, df): logger.info('Skipped handling of outliers') return df - def _winsorization(self, df): + def _winsorization(self, df, outlier_skip): # function for outlier winsorization - cols_num = df.select_dtypes(include=np.number).columns - for feature in cols_num: + cols_num = [col for col in df.select_dtypes(include=np.number).columns if col not in outlier_skip] + for feature in cols_num: + if feature not in df.columns: + raise ValueError(f"Feature {feature} does not exist in the DataFrame") counter = 0 # compute outlier bounds lower_bound, upper_bound = Outliers._compute_bounds(self, df, feature) @@ -308,10 +312,12 @@ def _winsorization(self, df): logger.debug('Outlier imputation of {} value(s) succeeded for feature "{}"', counter, feature) return df - def _delete(self, df): + def _delete(self, df, outlier_skip): # function for deleting outliers in the data - cols_num = df.select_dtypes(include=np.number).columns + cols_num = [col for col in df.select_dtypes(include=np.number).columns if col not in outlier_skip] for feature in cols_num: + if feature not in df.columns: + raise ValueError(f"Feature {feature} does not exist in the DataFrame") counter = 0 lower_bound, upper_bound = Outliers._compute_bounds(self, df, feature) # delete observations containing outliers