Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions AutoClean/autoclean.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

class AutoClean:

def __init__(self, input_data, mode='auto', duplicates=False, missing_num=False, missing_categ=False, encode_categ=False, extract_datetime=False, outliers=False, outlier_param=1.5, logfile=True, verbose=False):
def __init__(self, input_data, mode='auto', duplicates=False, missing_num=False, missing_categ=False, encode_categ=False, extract_datetime=False, outliers=False, outlier_skip=[], outlier_param=1.5, logfile=True, verbose=False):
'''
input_data (dataframe)..........Pandas dataframe
mode (str)......................define in which mode you want to run AutoClean
Expand Down Expand Up @@ -135,12 +135,12 @@ def _validate_params(self, df, verbose, logfile):
logger.info('Completed validation of input parameters')
return

def _clean_data(self, df, input_data):
def _clean_data(self, df, input_data, skip_duplicate=[]):
# function for starting the autoclean process
df = df.reset_index(drop=True)
df = Duplicates.handle(self, df)
df = MissingValues.handle(self, df)
df = Outliers.handle(self, df)
df = Outliers.handle(self, df, outlier_skip=[])
df = Adjust.convert_datetime(self, df)
df = EncodeCateg.handle(self, df)
df = Adjust.round_values(self, df, input_data)
Expand Down
22 changes: 14 additions & 8 deletions AutoClean/modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,27 +264,31 @@ def _delete(self, df, type):

class Outliers:

def handle(self, df, outlier_skip=None):
    """Handle outliers in ``df`` according to ``self.outliers``.

    ``'auto'`` and ``'winz'`` dispatch to ``Outliers._winsorization``;
    ``'delete'`` dispatches to ``Outliers._delete``. When ``self.outliers``
    is falsy, outlier handling is skipped and ``df`` is returned as-is.

    Args:
        self: object exposing the ``outliers`` setting.
        df: pandas DataFrame to process.
        outlier_skip: column label(s) to leave out of outlier handling.
            ``None`` (the default) skips nothing, which keeps the earlier
            two-argument call ``handle(self, df)`` working. A single string
            is treated as one column label.

    Returns:
        The DataFrame produced by the selected strategy, or ``df`` itself
        when no strategy applies.

    Raises:
        ValueError: if ``outlier_skip`` names a column that is not in ``df``.
    """
    if not self.outliers:
        logger.info('Skipped handling of outliers')
        return df

    # Normalise to a fresh list. A bare string must not be iterated
    # character by character (set('ab') == {'a', 'b'}), and the helpers
    # use ``col not in outlier_skip``, which would be a substring test.
    if outlier_skip is None:
        outlier_skip = []
    elif isinstance(outlier_skip, str):
        outlier_skip = [outlier_skip]
    else:
        outlier_skip = list(outlier_skip)

    # Validate before doing any work so a typo in a column name fails fast.
    if not set(outlier_skip).issubset(df.columns):
        raise ValueError("Some columns in outlier_skip do not exist in the DataFrame")

    logger.info('Started handling of outliers... Method: "{}"', str(self.outliers).upper())
    start = timer()

    if self.outliers in ['auto', 'winz']:
        df = Outliers._winsorization(self, df, outlier_skip)
    elif self.outliers == 'delete':
        df = Outliers._delete(self, df, outlier_skip)

    end = timer()
    logger.info('Completed handling of outliers in {} seconds', round(end-start, 6))
    return df

def _winsorization(self, df):
def _winsorization(self, df, outlier_skip):
# function for outlier winsorization
cols_num = df.select_dtypes(include=np.number).columns
for feature in cols_num:
cols_num = [col for col in df.select_dtypes(include=np.number).columns if col not in outlier_skip]
for feature in cols_num:
if feature not in df.columns:
raise ValueError(f"Feature {feature} does not exist in the DataFrame")
counter = 0
# compute outlier bounds
lower_bound, upper_bound = Outliers._compute_bounds(self, df, feature)
Expand All @@ -308,10 +312,12 @@ def _winsorization(self, df):
logger.debug('Outlier imputation of {} value(s) succeeded for feature "{}"', counter, feature)
return df

def _delete(self, df):
def _delete(self, df, outlier_skip):
# function for deleting outliers in the data
cols_num = df.select_dtypes(include=np.number).columns
cols_num = [col for col in df.select_dtypes(include=np.number).columns if col not in outlier_skip]
for feature in cols_num:
if feature not in df.columns:
raise ValueError(f"Feature {feature} does not exist in the DataFrame")
counter = 0
lower_bound, upper_bound = Outliers._compute_bounds(self, df, feature)
# delete observations containing outliers
Expand Down