From 4b783e6a221d8fb92bdcc7dd1793054fe53eb3db Mon Sep 17 00:00:00 2001 From: Tom Artiom Fiodorov Date: Mon, 23 Jul 2018 01:14:18 +0100 Subject: [PATCH 1/7] adding solution.py --- solution.py | 193 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100755 solution.py diff --git a/solution.py b/solution.py new file mode 100755 index 0000000..1e2e07a --- /dev/null +++ b/solution.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 + +import argparse +import zipfile +import datetime + +import pandas as pd +import numpy as np +from sklearn.model_selection import train_test_split +import sklearn.metrics + + +def read_df(file_name): + print("Reading %s..." % file_name) + df = pd.read_csv(file_name, dtype={'id': str, 'Customer ID': str, + 'Product SKU': str, 'Price': str, + 'price': float, 'profit': float}) + original_len = len(df) + print('Read in %d rows...' % original_len) + df.dropna(inplace=True) + print("Dropped %d rows containing nan..." % (original_len - len(df))) + + if 'Customer ID' in df.columns: + df.rename(columns={'Customer ID': 'id'}, inplace=True) + print("Renamed Customer ID to id...") + + if 'Order Date' in df.columns: + df['Order Date'] = pd.to_datetime(df['Order Date'].str[:10], format="%d/%m/%Y") + else: + df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d %H:%M:%S") + print("Parsed dates...") + + if 'Order Date' in df.columns: + df.rename(columns={'Order Date': 'date'}, inplace=True) + print("Renamed Order Date to date...") + + if 'Price' in df.columns: + df['Price'] = df['Price'].str.replace(',', '').astype(float) + df.rename(columns={'Price': 'price'}, inplace=True) + print("Renamed Price to price...") + + original_len = len(df) + df = df[df['id'] != '-1'] + print("Dropped %d rows containing -1 in id column..." % (original_len - len(df))) + print() + + return df + + +def create_healthy_binary_target(df): + average_days_between_orders = df.groupby('id')['date'].apply( + lambda x: (x.max() - x.min()) / len(x)) + average_days_between_orders = average_days_between_orders[ + average_days_between_orders != datetime.timedelta(0)] + + print("Percentiles for average days between orders for repeated customers:") + print(average_days_between_orders.describe(percentiles=np.arange(0, 1, 0.1))) + print() + + quantile = average_days_between_orders.quantile(q=0.9) + print("90%% of the customers that make repeated purchase make it after %d days." % + quantile.days) + latest_date = df['date'].max() + + cut_off = latest_date - quantile + print("Latest date found in the dataset: %s." % latest_date) + print("Customers who have purchases after %s are considered healthy." % cut_off) + + df['healthy'] = df.groupby('id')['date'].transform(max) > cut_off + print("Created %d transactions corresponding to healthy customers." % np.sum(df['healthy'])) + + unique_customers = df.groupby('id')['healthy'].tail(1) + num_unique = len(unique_customers) + num_healthy = np.sum(unique_customers) + print("Number of unique customers: %d." % num_unique) + print("Number of healthy customers: %d." % num_healthy) + print("Proportion of healthy customers: %0.2f." % (num_healthy / num_unique)) + + return df + + +def add_features(df): + print("Adding secondsSinceRegistration feature...") + df['secondsSinceRegistration'] = df.groupby('id')['date'].transform( + lambda x: (x - x.min())).apply(lambda x: x.total_seconds()) + print("Adding numOfTransactions features...") + df['numOfTransactions'] = df.groupby('id')['date'].transform(lambda x: np.argsort(x) + 1) + + return df + + +def train_test_splitting(df): + train_ids, _ = train_test_split(df['id'].unique()) + train_mask = df['id'].isin(train_ids) + df_train = df[train_mask] + df_test = df[~train_mask] + print("Size of training data set: %d." % len(df_train)) + print("Size of test data set: %d." % len(df_test)) + train_mean = df_train['healthy'].mean() + print("Proportion of transactions corresponding to healthy customers in training dataset: " + "%0.2f." % train_mean) + test_mean = df_test['healthy'].mean() + print("Proportion of transactions corresponding to healthy customers in test dataset: %0.2f." % + test_mean) + + return df_train, df_test + + +def features(df): + features_ = ['price', 'secondsSinceRegistration', 'numOfTransactions'] + if 'profit' in df.columns: + features_.append('profit') + return features_ + + +def create_find_roc_metric(df_test): + orders_test_latest = df_test.sort_values('date').groupby('id').tail(1) + + def find_roc_metric(model): + healthy_proba = model.predict_proba(orders_test_latest[features(df_test)])[:, 1] + roc_metric = sklearn.metrics.roc_auc_score(orders_test_latest['healthy'], healthy_proba) + return roc_metric + + return find_roc_metric + + +def find_best_model(metric_func, df_train): + print("Finding the best random forest model...") + curr_area = -100 + curr_model = None + + search_dict = {} + + for min_impurity_decrease in list(np.arange(0.0, 0.3, 0.02)): + rf = sklearn.ensemble.RandomForestClassifier(min_impurity_decrease=min_impurity_decrease) + rf.fit(df_train[features], df_train['healthy']) + area = metric_func(rf) + if area > curr_area: + curr_area = area + curr_model = rf + + search_dict[min_impurity_decrease] = area + print("Min impurity: %0.3f. Area under ROC curve: %0.3f." % (min_impurity_decrease, area)) + + return curr_model + + +def print_feature_importances(model, df): + print("Most important features:") + print(sorted(list(zip(features(df), model.feature_importances_)), key=lambda x: x[1], + reverse=True)) + + +def customers_health(model, df): + if 'health_score' in df: + del df['health_score'] + + print("Adding health score column...") + orders_latest = df.sort_values('date').groupby('id').tail(1) + proba = model.predict_proba(orders_latest[features(df)])[:, 1] + health_score = pd.DataFrame({'id': orders_latest['id'], 'health_score': proba}) + return df.join(health_score.set_index('id'), how='inner', on='id') + + +def print_csv(df): + print("Printig csv...") + print() + print("id,health_score") + print(df.groupby('id')['health_score'].max().to_csv()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Train health model.') + parser.add_argument('file_name', type=str, help='an integer for the accumulator') + + args = parser.parse_args() + file_name = args.file_name + + with zipfile.ZipFile("orders.zip", 'r') as zip_ref: + print("Extracting orders.zip...") + zip_ref.extractall('.') + + df = read_df(file_name) + df = create_healthy_binary_target(df) + df = add_features(df) + df_train, df_test = train_test_splitting(df) + roc_metric = create_find_roc_metric(df_test) + model = find_best_model(roc_metric, df_train) + print_feature_importances(model, df) + df = customers_health(model, df) + print("Saving full csv...") + df.to_csv(file_name[:-4] + '-new.csv', index=False) + print_csv(df) From 3b1f244f63672bfb7a48d192b4a6c7828133a53e Mon Sep 17 00:00:00 2001 From: Tom Artiom Fiodorov Date: Mon, 23 Jul 2018 01:15:20 +0100 Subject: [PATCH 2/7] adding requirements.txt --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..5ab9999 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +numpy +pandas +sklearn From d1e5214459be23b474f4238eb4317ee57db95ab6 Mon Sep 17 00:00:00 2001 From: Tom Artiom Fiodorov Date: Mon, 23 Jul 2018 01:18:31 +0100 Subject: [PATCH 3/7] forgot import --- solution.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/solution.py b/solution.py index 1e2e07a..675f0f9 100755 --- a/solution.py +++ b/solution.py @@ -8,6 +8,7 @@ import numpy as np from sklearn.model_selection import train_test_split import sklearn.metrics +import sklearn.ensemble def read_df(file_name): @@ -53,7 +54,7 @@ def create_healthy_binary_target(df): average_days_between_orders = average_days_between_orders[ average_days_between_orders != datetime.timedelta(0)] - print("Percentiles for average days between orders for repeated customers:") + print("Percentiles for average days between orders for returning customers:") print(average_days_between_orders.describe(percentiles=np.arange(0, 1, 0.1))) print() From feb17a5e56bf544d6c3b8c6e8e6ee8098245a9de Mon Sep 17 00:00:00 2001 From: Tom Artiom Fiodorov Date: Mon, 23 Jul 2018 01:31:12 +0100 Subject: [PATCH 4/7] flushing print --- solution.py | 68 +++++++++++++++++++++++++++-------------------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/solution.py b/solution.py index 675f0f9..6131fd3 100755 --- a/solution.py +++ b/solution.py @@ -12,38 +12,38 @@ def read_df(file_name): - print("Reading %s..." % file_name) + print("Reading %s..." % file_name, flush=True) df = pd.read_csv(file_name, dtype={'id': str, 'Customer ID': str, 'Product SKU': str, 'Price': str, 'price': float, 'profit': float}) original_len = len(df) - print('Read in %d rows...' % original_len) + print('Read in %d rows...' % original_len, flush=True) df.dropna(inplace=True) - print("Dropped %d rows containing nan..." % (original_len - len(df))) + print("Dropped %d rows containing nan..." % (original_len - len(df)), flush=True) if 'Customer ID' in df.columns: df.rename(columns={'Customer ID': 'id'}, inplace=True) - print("Renamed Customer ID to id...") + print("Renamed Customer ID to id...", flush=True) if 'Order Date' in df.columns: df['Order Date'] = pd.to_datetime(df['Order Date'].str[:10], format="%d/%m/%Y") else: df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d %H:%M:%S") - print("Parsed dates...") + print("Parsed dates...", flush=True) if 'Order Date' in df.columns: df.rename(columns={'Order Date': 'date'}, inplace=True) - print("Renamed Order Date to date...") + print("Renamed Order Date to date...", flush=True) if 'Price' in df.columns: df['Price'] = df['Price'].str.replace(',', '').astype(float) df.rename(columns={'Price': 'price'}, inplace=True) - print("Renamed Price to price...") + print("Renamed Price to price...", flush=True) original_len = len(df) df = df[df['id'] != '-1'] - print("Dropped %d rows containing -1 in id column..." % (original_len - len(df))) - print() + print("Dropped %d rows containing -1 in id column..." % (original_len - len(df)), flush=True) + print(flush=True) return df @@ -54,37 +54,38 @@ def create_healthy_binary_target(df): average_days_between_orders = average_days_between_orders[ average_days_between_orders != datetime.timedelta(0)] - print("Percentiles for average days between orders for returning customers:") - print(average_days_between_orders.describe(percentiles=np.arange(0, 1, 0.1))) - print() + print("Percentiles for average days between orders for returning customers:", flush=True) + print(average_days_between_orders.describe(percentiles=np.arange(0, 1, 0.1)), flush=True) + print(flush=True) quantile = average_days_between_orders.quantile(q=0.9) print("90%% of the customers that make repeated purchase make it after %d days." % - quantile.days) + quantile.days, flush=True) latest_date = df['date'].max() cut_off = latest_date - quantile - print("Latest date found in the dataset: %s." % latest_date) - print("Customers who have purchases after %s are considered healthy." % cut_off) + print("Latest date found in the dataset: %s." % latest_date, flush=True) + print("Customers who have purchases after %s are considered healthy." % cut_off, flush=True) df['healthy'] = df.groupby('id')['date'].transform(max) > cut_off - print("Created %d transactions corresponding to healthy customers." % np.sum(df['healthy'])) + print("Created %d transactions corresponding to healthy customers." % np.sum(df['healthy']), + flush=True) unique_customers = df.groupby('id')['healthy'].tail(1) num_unique = len(unique_customers) num_healthy = np.sum(unique_customers) - print("Number of unique customers: %d." % num_unique) - print("Number of healthy customers: %d." % num_healthy) - print("Proportion of healthy customers: %0.2f." % (num_healthy / num_unique)) + print("Number of unique customers: %d." % num_unique, flush=True) + print("Number of healthy customers: %d." % num_healthy, flush=True) + print("Proportion of healthy customers: %0.2f." % (num_healthy / num_unique), flush=True) return df def add_features(df): - print("Adding secondsSinceRegistration feature...") + print("Adding secondsSinceRegistration feature...", flush=True) df['secondsSinceRegistration'] = df.groupby('id')['date'].transform( lambda x: (x - x.min())).apply(lambda x: x.total_seconds()) - print("Adding numOfTransactions features...") + print("Adding numOfTransactions features...", flush=True) df['numOfTransactions'] = df.groupby('id')['date'].transform(lambda x: np.argsort(x) + 1) return df @@ -95,14 +96,14 @@ def train_test_splitting(df): train_mask = df['id'].isin(train_ids) df_train = df[train_mask] df_test = df[~train_mask] - print("Size of training data set: %d." % len(df_train)) - print("Size of test data set: %d." % len(df_test)) + print("Size of training data set: %d." % len(df_train), flush=True) + print("Size of test data set: %d." % len(df_test), flush=True) train_mean = df_train['healthy'].mean() print("Proportion of transactions corresponding to healthy customers in training dataset: " - "%0.2f." % train_mean) + "%0.2f." % train_mean, flush=True) test_mean = df_test['healthy'].mean() print("Proportion of transactions corresponding to healthy customers in test dataset: %0.2f." % - test_mean) + test_mean, flush=True) return df_train, df_test @@ -126,7 +127,7 @@ def find_roc_metric(model): def find_best_model(metric_func, df_train): - print("Finding the best random forest model...") + print("Finding the best random forest model...", flush=True) curr_area = -100 curr_model = None @@ -141,22 +142,23 @@ def find_best_model(metric_func, df_train): curr_model = rf search_dict[min_impurity_decrease] = area - print("Min impurity: %0.3f. Area under ROC curve: %0.3f." % (min_impurity_decrease, area)) + print("Min impurity: %0.3f. Area under ROC curve: %0.3f." % (min_impurity_decrease, area), + flush=True) return curr_model def print_feature_importances(model, df): - print("Most important features:") + print("Most important features:", flush=True) print(sorted(list(zip(features(df), model.feature_importances_)), key=lambda x: x[1], - reverse=True)) + reverse=True), flush=True) def customers_health(model, df): if 'health_score' in df: del df['health_score'] - print("Adding health score column...") + print("Adding health score column...", flush=True) orders_latest = df.sort_values('date').groupby('id').tail(1) proba = model.predict_proba(orders_latest[features(df)])[:, 1] health_score = pd.DataFrame({'id': orders_latest['id'], 'health_score': proba}) @@ -164,9 +166,9 @@ def customers_health(model, df): def print_csv(df): - print("Printig csv...") + print("Printig csv...", flush=True) print() - print("id,health_score") + print("id,health_score", flush=True) print(df.groupby('id')['health_score'].max().to_csv()) @@ -178,7 +180,7 @@ def print_csv(df): file_name = args.file_name with zipfile.ZipFile("orders.zip", 'r') as zip_ref: - print("Extracting orders.zip...") + print("Extracting orders.zip...", flush=True) zip_ref.extractall('.') df = read_df(file_name) From 12ba28e95caaf3bb05c32d72fae0fceecb316fa7 Mon Sep 17 00:00:00 2001 From: Tom Artiom Fiodorov Date: Mon, 23 Jul 2018 01:36:06 +0100 Subject: [PATCH 5/7] adding orders-2 run log --- orders-2-run.log | 65 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 orders-2-run.log diff --git a/orders-2-run.log b/orders-2-run.log new file mode 100644 index 0000000..f919653 --- /dev/null +++ b/orders-2-run.log @@ -0,0 +1,65 @@ +Extracting orders.zip... +Reading orders-2.csv... +Read in 499999 rows... +Dropped 5 rows containing nan... +Parsed dates... +Dropped 1 rows containing -1 in id column... + +Percentiles for average days between orders for returning customers: +count 51720 +mean 16 days 18:23:21.946281 +std 14 days 01:22:24.347529 +min 0 days 00:00:01 +0% 0 days 00:00:01 +10% 3 days 00:41:38.900000 +20% 5 days 12:14:16.070000 +30.0% 7 days 20:25:11.034782 +40% 10 days 08:18:29.133333 +50% 13 days 01:51:06.333333 +60.0% 16 days 05:22:57.191111 +70% 20 days 05:32:30.519999 +80% 25 days 18:30:30.033333 +90% 35 days 11:43:19.660000 +max 89 days 11:15:48.500000 +Name: date, dtype: object + +90% of the customers that make repeated purchase make it after 35 days. +Latest date found in the dataset: 2018-05-01 23:56:52. +Customers who have purchases after 2018-03-27 12:13:32.340000 are considered healthy. +Created 385972 transactions corresponding to healthy customers. +Number of unique customers: 86207. +Number of healthy customers: 39508. +Proportion of healthy customers: 0.46. +Adding secondsSinceRegistration feature... +Adding numOfTransactions features... +Size of training data set: 374392. +Size of test data set: 125601. +Proportion of transactions corresponding to healthy customers in training dataset: 0.77. +Proportion of transactions corresponding to healthy customers in test dataset: 0.77. +Finding the best random forest model... +Min impurity: 0.000. Area under ROC curve: 0.738. +Min impurity: 0.020. Area under ROC curve: 0.724. +Min impurity: 0.040. Area under ROC curve: 0.724. +Min impurity: 0.060. Area under ROC curve: 0.500. +Min impurity: 0.080. Area under ROC curve: 0.500. +Min impurity: 0.100. Area under ROC curve: 0.500. +Min impurity: 0.120. Area under ROC curve: 0.500. +Min impurity: 0.140. Area under ROC curve: 0.500. +Min impurity: 0.160. Area under ROC curve: 0.500. +Min impurity: 0.180. Area under ROC curve: 0.500. +Min impurity: 0.200. Area under ROC curve: 0.500. +Min impurity: 0.220. Area under ROC curve: 0.500. +Min impurity: 0.240. Area under ROC curve: 0.500. +Min impurity: 0.260. Area under ROC curve: 0.500. +Min impurity: 0.280. Area under ROC curve: 0.500. +Most important features: +[('secondsSinceRegistration', 0.32592714760036945), ('price', 0.27353931173752621), ('profit', 0.20867555823625111), ('numOfTransactions', 0.19185798242585314)] +Adding health score column... +Saving full csv... +Printig csv... + +id,health_score +0000b0000ac761edbe8480f8273fdb07,0.7 +00017b2e0c5ecad4d92f6d7b7f4fa1f0,1.0 +00018653eaf26f920973426529654866,0.325 +... From b3a8405a3c0d17eefbdff3ec398dd3e164b79f89 Mon Sep 17 00:00:00 2001 From: Tom Artiom Fiodorov Date: Mon, 23 Jul 2018 01:37:33 +0100 Subject: [PATCH 6/7] adding orders-1 run log --- orders-1-run.log | 67 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 orders-1-run.log diff --git a/orders-1-run.log b/orders-1-run.log new file mode 100644 index 0000000..f8d7dda --- /dev/null +++ b/orders-1-run.log @@ -0,0 +1,67 @@ +Extracting orders.zip... +Reading orders-1.csv... +Read in 533623 rows... +Dropped 65 rows containing nan... +Renamed Customer ID to id... +Parsed dates... +Renamed Order Date to date... +Renamed Price to price... +Dropped 1411 rows containing -1 in id column... + +Percentiles for average days between orders for returning customers: +count 51501 +mean 32 days 05:31:38.178793 +std 24 days 13:12:07.576172 +min 0 days 01:42:51.428571 +0% 0 days 01:42:51.428571 +10% 8 days 06:00:00 +20% 13 days 00:00:00 +30.0% 17 days 04:00:00 +40% 21 days 12:00:00 +50% 26 days 03:25:42.857142 +60.0% 31 days 10:40:00 +70% 38 days 08:00:00 +80% 47 days 16:00:00 +90% 64 days 00:00:00 +max 179 days 00:00:00 +Name: date, dtype: object + +90% of the customers that make repeated purchase make it after 64 days. +Latest date found in the dataset: 2018-06-27 00:00:00. +Customers who have purchases after 2018-04-24 00:00:00 are considered healthy. +Created 271183 transactions corresponding to healthy customers. +Number of unique customers: 123468. +Number of healthy customers: 39610. +Proportion of healthy customers: 0.32. +Adding secondsSinceRegistration feature... +Adding numOfTransactions features... +Size of training data set: 398630. +Size of test data set: 133517. +Proportion of transactions corresponding to healthy customers in training dataset: 0.51. +Proportion of transactions corresponding to healthy customers in test dataset: 0.51. +Finding the best random forest model... +Min impurity: 0.000. Area under ROC curve: 0.764. +Min impurity: 0.020. Area under ROC curve: 0.686. +Min impurity: 0.040. Area under ROC curve: 0.684. +Min impurity: 0.060. Area under ROC curve: 0.500. +Min impurity: 0.080. Area under ROC curve: 0.500. +Min impurity: 0.100. Area under ROC curve: 0.500. +Min impurity: 0.120. Area under ROC curve: 0.500. +Min impurity: 0.140. Area under ROC curve: 0.500. +Min impurity: 0.160. Area under ROC curve: 0.500. +Min impurity: 0.180. Area under ROC curve: 0.500. +Min impurity: 0.200. Area under ROC curve: 0.500. +Min impurity: 0.220. Area under ROC curve: 0.500. +Min impurity: 0.240. Area under ROC curve: 0.500. +Min impurity: 0.260. Area under ROC curve: 0.500. +Min impurity: 0.280. Area under ROC curve: 0.500. +Most important features: +[('secondsSinceRegistration', 0.45706872620175287), ('price', 0.41632602073665953), ('numOfTransactions', 0.1266052530615876)] +Adding health score column... +Saving full csv... +Printig csv... + +id,health_score +1000001170897,0.2505100887996181 +1000015470766,1.0 +... From 224013e8fba7416c0d60def115fc78ff9fe3fd8d Mon Sep 17 00:00:00 2001 From: Tom Artiom Fiodorov Date: Mon, 23 Jul 2018 01:56:37 +0100 Subject: [PATCH 7/7] better help --- solution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/solution.py b/solution.py index 6131fd3..6d8ee3e 100755 --- a/solution.py +++ b/solution.py @@ -174,7 +174,7 @@ def print_csv(df): if __name__ == "__main__": parser = argparse.ArgumentParser(description='Train health model.') - parser.add_argument('file_name', type=str, help='an integer for the accumulator') + parser.add_argument('file_name', type=str, help='orders-1.csv or orders-2.csv.') args = parser.parse_args() file_name = args.file_name