diff --git a/orders-1-run.log b/orders-1-run.log
new file mode 100644
index 0000000..f8d7dda
--- /dev/null
+++ b/orders-1-run.log
@@ -0,0 +1,67 @@
+Extracting orders.zip...
+Reading orders-1.csv...
+Read in 533623 rows...
+Dropped 65 rows containing nan...
+Renamed Customer ID to id...
+Parsed dates...
+Renamed Order Date to date...
+Renamed Price to price...
+Dropped 1411 rows containing -1 in id column...
+
+Percentiles for average days between orders for returning customers:
+count                        51501
+mean      32 days 05:31:38.178793
+std       24 days 13:12:07.576172
+min        0 days 01:42:51.428571
+0%         0 days 01:42:51.428571
+10%               8 days 06:00:00
+20%              13 days 00:00:00
+30.0%            17 days 04:00:00
+40%              21 days 12:00:00
+50%       26 days 03:25:42.857142
+60.0%            31 days 10:40:00
+70%              38 days 08:00:00
+80%              47 days 16:00:00
+90%              64 days 00:00:00
+max             179 days 00:00:00
+Name: date, dtype: object
+
+90% of the customers that make repeated purchases make them within 64 days.
+Latest date found in the dataset: 2018-06-27 00:00:00.
+Customers who have purchases after 2018-04-24 00:00:00 are considered healthy.
+Created 271183 transactions corresponding to healthy customers.
+Number of unique customers: 123468.
+Number of healthy customers: 39610.
+Proportion of healthy customers: 0.32.
+Adding secondsSinceRegistration feature...
+Adding numOfTransactions feature...
+Size of training data set: 398630.
+Size of test data set: 133517.
+Proportion of transactions corresponding to healthy customers in training dataset: 0.51.
+Proportion of transactions corresponding to healthy customers in test dataset: 0.51.
+Finding the best random forest model...
+Min impurity: 0.000. Area under ROC curve: 0.764.
+Min impurity: 0.020. Area under ROC curve: 0.686.
+Min impurity: 0.040. Area under ROC curve: 0.684.
+Min impurity: 0.060. Area under ROC curve: 0.500.
+Min impurity: 0.080. Area under ROC curve: 0.500.
+Min impurity: 0.100. Area under ROC curve: 0.500.
+Min impurity: 0.120. Area under ROC curve: 0.500.
+Min impurity: 0.140. Area under ROC curve: 0.500.
+Min impurity: 0.160. Area under ROC curve: 0.500.
+Min impurity: 0.180. Area under ROC curve: 0.500.
+Min impurity: 0.200. Area under ROC curve: 0.500.
+Min impurity: 0.220. Area under ROC curve: 0.500.
+Min impurity: 0.240. Area under ROC curve: 0.500.
+Min impurity: 0.260. Area under ROC curve: 0.500.
+Min impurity: 0.280. Area under ROC curve: 0.500.
+Most important features:
+[('secondsSinceRegistration', 0.45706872620175287), ('price', 0.41632602073665953), ('numOfTransactions', 0.1266052530615876)]
+Adding health score column...
+Saving full csv...
+Printing csv...
+
+id,health_score
+1000001170897,0.2505100887996181
+1000015470766,1.0
+...
diff --git a/orders-2-run.log b/orders-2-run.log
new file mode 100644
index 0000000..f919653
--- /dev/null
+++ b/orders-2-run.log
@@ -0,0 +1,65 @@
+Extracting orders.zip...
+Reading orders-2.csv...
+Read in 499999 rows...
+Dropped 5 rows containing nan...
+Parsed dates...
+Dropped 1 rows containing -1 in id column...
+
+Percentiles for average days between orders for returning customers:
+count                        51720
+mean      16 days 18:23:21.946281
+std       14 days 01:22:24.347529
+min                0 days 00:00:01
+0%                 0 days 00:00:01
+10%        3 days 00:41:38.900000
+20%        5 days 12:14:16.070000
+30.0%      7 days 20:25:11.034782
+40%       10 days 08:18:29.133333
+50%       13 days 01:51:06.333333
+60.0%     16 days 05:22:57.191111
+70%       20 days 05:32:30.519999
+80%       25 days 18:30:30.033333
+90%       35 days 11:43:19.660000
+max       89 days 11:15:48.500000
+Name: date, dtype: object
+
+90% of the customers that make repeated purchases make them within 35 days.
+Latest date found in the dataset: 2018-05-01 23:56:52.
+Customers who have purchases after 2018-03-27 12:13:32.340000 are considered healthy.
+Created 385972 transactions corresponding to healthy customers.
+Number of unique customers: 86207.
+Number of healthy customers: 39508.
+Proportion of healthy customers: 0.46.
+Adding secondsSinceRegistration feature...
+Adding numOfTransactions feature...
+Size of training data set: 374392.
+Size of test data set: 125601.
+Proportion of transactions corresponding to healthy customers in training dataset: 0.77.
+Proportion of transactions corresponding to healthy customers in test dataset: 0.77.
+Finding the best random forest model...
+Min impurity: 0.000. Area under ROC curve: 0.738.
+Min impurity: 0.020. Area under ROC curve: 0.724.
+Min impurity: 0.040. Area under ROC curve: 0.724.
+Min impurity: 0.060. Area under ROC curve: 0.500.
+Min impurity: 0.080. Area under ROC curve: 0.500.
+Min impurity: 0.100. Area under ROC curve: 0.500.
+Min impurity: 0.120. Area under ROC curve: 0.500.
+Min impurity: 0.140. Area under ROC curve: 0.500.
+Min impurity: 0.160. Area under ROC curve: 0.500.
+Min impurity: 0.180. Area under ROC curve: 0.500.
+Min impurity: 0.200. Area under ROC curve: 0.500.
+Min impurity: 0.220. Area under ROC curve: 0.500.
+Min impurity: 0.240. Area under ROC curve: 0.500.
+Min impurity: 0.260. Area under ROC curve: 0.500.
+Min impurity: 0.280. Area under ROC curve: 0.500.
+Most important features:
+[('secondsSinceRegistration', 0.32592714760036945), ('price', 0.27353931173752621), ('profit', 0.20867555823625111), ('numOfTransactions', 0.19185798242585314)]
+Adding health score column...
+Saving full csv...
+Printing csv...
+
+id,health_score
+0000b0000ac761edbe8480f8273fdb07,0.7
+00017b2e0c5ecad4d92f6d7b7f4fa1f0,1.0
+00018653eaf26f920973426529654866,0.325
+...
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..5ab9999
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+numpy
+pandas
+scikit-learn
diff --git a/solution.py b/solution.py
new file mode 100755
index 0000000..6d8ee3e
--- /dev/null
+++ b/solution.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python3
+
+import argparse
+import zipfile
+import datetime
+
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+import sklearn.metrics
+import sklearn.ensemble
+
+
+def read_df(file_name):
+    print("Reading %s..." % file_name, flush=True)
+    df = pd.read_csv(file_name, dtype={'id': str, 'Customer ID': str,
+                                       'Product SKU': str, 'Price': str,
+                                       'price': float, 'profit': float})
+    original_len = len(df)
+    print('Read in %d rows...' % original_len, flush=True)
+    df.dropna(inplace=True)
+    print("Dropped %d rows containing nan..."
+          % (original_len - len(df)), flush=True)
+
+    if 'Customer ID' in df.columns:
+        df.rename(columns={'Customer ID': 'id'}, inplace=True)
+        print("Renamed Customer ID to id...", flush=True)
+
+    if 'Order Date' in df.columns:
+        df['Order Date'] = pd.to_datetime(df['Order Date'].str[:10], format="%d/%m/%Y")
+    else:
+        df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d %H:%M:%S")
+    print("Parsed dates...", flush=True)
+
+    if 'Order Date' in df.columns:
+        df.rename(columns={'Order Date': 'date'}, inplace=True)
+        print("Renamed Order Date to date...", flush=True)
+
+    if 'Price' in df.columns:
+        df['Price'] = df['Price'].str.replace(',', '').astype(float)
+        df.rename(columns={'Price': 'price'}, inplace=True)
+        print("Renamed Price to price...", flush=True)
+
+    original_len = len(df)
+    df = df[df['id'] != '-1']
+    print("Dropped %d rows containing -1 in id column..." % (original_len - len(df)), flush=True)
+    print(flush=True)
+
+    return df
+
+
+def create_healthy_binary_target(df):
+    average_days_between_orders = df.groupby('id')['date'].apply(
+        lambda x: (x.max() - x.min()) / len(x))
+    average_days_between_orders = average_days_between_orders[
+        average_days_between_orders != datetime.timedelta(0)]
+
+    print("Percentiles for average days between orders for returning customers:", flush=True)
+    print(average_days_between_orders.describe(percentiles=np.arange(0, 1, 0.1)), flush=True)
+    print(flush=True)
+
+    quantile = average_days_between_orders.quantile(q=0.9)
+    print("90%% of the customers that make repeated purchases make them within %d days." %
+          quantile.days, flush=True)
+    latest_date = df['date'].max()
+
+    cut_off = latest_date - quantile
+    print("Latest date found in the dataset: %s." % latest_date, flush=True)
+    print("Customers who have purchases after %s are considered healthy." % cut_off, flush=True)
+
+    df['healthy'] = df.groupby('id')['date'].transform(max) > cut_off
+    print("Created %d transactions corresponding to healthy customers." % np.sum(df['healthy']),
+          flush=True)
+
+    unique_customers = df.groupby('id')['healthy'].tail(1)
+    num_unique = len(unique_customers)
+    num_healthy = np.sum(unique_customers)
+    print("Number of unique customers: %d." % num_unique, flush=True)
+    print("Number of healthy customers: %d." % num_healthy, flush=True)
+    print("Proportion of healthy customers: %0.2f." % (num_healthy / num_unique), flush=True)
+
+    return df
+
+
+def add_features(df):
+    print("Adding secondsSinceRegistration feature...", flush=True)
+    df['secondsSinceRegistration'] = df.groupby('id')['date'].transform(
+        lambda x: (x - x.min())).apply(lambda x: x.total_seconds())
+    print("Adding numOfTransactions feature...", flush=True)
+    df['numOfTransactions'] = df.groupby('id')['date'].transform(lambda x: np.argsort(x) + 1)
+
+    return df
+
+
+def train_test_splitting(df):
+    train_ids, _ = train_test_split(df['id'].unique())
+    train_mask = df['id'].isin(train_ids)
+    df_train = df[train_mask]
+    df_test = df[~train_mask]
+    print("Size of training data set: %d." % len(df_train), flush=True)
+    print("Size of test data set: %d." % len(df_test), flush=True)
+    train_mean = df_train['healthy'].mean()
+    print("Proportion of transactions corresponding to healthy customers in training dataset: "
+          "%0.2f." % train_mean, flush=True)
+    test_mean = df_test['healthy'].mean()
+    print("Proportion of transactions corresponding to healthy customers in test dataset: %0.2f." %
+          test_mean, flush=True)
+
+    return df_train, df_test
+
+
+def features(df):
+    features_ = ['price', 'secondsSinceRegistration', 'numOfTransactions']
+    if 'profit' in df.columns:
+        features_.append('profit')
+    return features_
+
+
+def create_find_roc_metric(df_test):
+    orders_test_latest = df_test.sort_values('date').groupby('id').tail(1)
+
+    def find_roc_metric(model):
+        healthy_proba = model.predict_proba(orders_test_latest[features(df_test)])[:, 1]
+        roc_metric = sklearn.metrics.roc_auc_score(orders_test_latest['healthy'], healthy_proba)
+        return roc_metric
+
+    return find_roc_metric
+
+
+def find_best_model(metric_func, df_train):
+    print("Finding the best random forest model...", flush=True)
+    curr_area = -100
+    curr_model = None
+
+    search_dict = {}
+
+    for min_impurity_decrease in list(np.arange(0.0, 0.3, 0.02)):
+        rf = sklearn.ensemble.RandomForestClassifier(min_impurity_decrease=min_impurity_decrease)
+        rf.fit(df_train[features(df_train)], df_train['healthy'])
+        area = metric_func(rf)
+        if area > curr_area:
+            curr_area = area
+            curr_model = rf
+
+        search_dict[min_impurity_decrease] = area
+        print("Min impurity: %0.3f. Area under ROC curve: %0.3f." % (min_impurity_decrease, area),
+              flush=True)
+
+    return curr_model
+
+
+def print_feature_importances(model, df):
+    print("Most important features:", flush=True)
+    print(sorted(list(zip(features(df), model.feature_importances_)), key=lambda x: x[1],
+                 reverse=True), flush=True)
+
+
+def customers_health(model, df):
+    if 'health_score' in df:
+        del df['health_score']
+
+    print("Adding health score column...", flush=True)
+    orders_latest = df.sort_values('date').groupby('id').tail(1)
+    proba = model.predict_proba(orders_latest[features(df)])[:, 1]
+    health_score = pd.DataFrame({'id': orders_latest['id'], 'health_score': proba})
+    return df.join(health_score.set_index('id'), how='inner', on='id')
+
+
+def print_csv(df):
+    print("Printing csv...", flush=True)
+    print()
+    print("id,health_score", flush=True)
+    print(df.groupby('id')['health_score'].max().to_csv())
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Train health model.')
+    parser.add_argument('file_name', type=str, help='orders-1.csv or orders-2.csv.')
+
+    args = parser.parse_args()
+    file_name = args.file_name
+
+    with zipfile.ZipFile("orders.zip", 'r') as zip_ref:
+        print("Extracting orders.zip...", flush=True)
+        zip_ref.extractall('.')
+
+    df = read_df(file_name)
+    df = create_healthy_binary_target(df)
+    df = add_features(df)
+    df_train, df_test = train_test_splitting(df)
+    roc_metric = create_find_roc_metric(df_test)
+    model = find_best_model(roc_metric, df_train)
+    print_feature_importances(model, df)
+    df = customers_health(model, df)
+    print("Saving full csv...")
+    df.to_csv(file_name[:-4] + '-new.csv', index=False)
+    print_csv(df)
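
Usage (a minimal sketch; it assumes orders.zip sits in the working directory next to solution.py, since the script extracts the archive itself before reading the CSV):

    pip install -r requirements.txt
    python3 solution.py orders-1.csv    # or orders-2.csv

A run writes orders-1-new.csv (the input name with a -new suffix, carrying the engineered features and the health_score column) and prints the per-customer id,health_score CSV shown at the end of the run logs above.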