From 4b783e6a221d8fb92bdcc7dd1793054fe53eb3db Mon Sep 17 00:00:00 2001
From: Tom Artiom Fiodorov <a.fiodorov@yahoo.co.uk>
Date: Mon, 23 Jul 2018 01:14:18 +0100
Subject: [PATCH 1/7] adding solution.py

---
 solution.py | 193 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 193 insertions(+)
 create mode 100755 solution.py

diff --git a/solution.py b/solution.py
new file mode 100755
index 0000000..1e2e07a
--- /dev/null
+++ b/solution.py
@@ -0,0 +1,193 @@
+#!/usr/bin/env python3
+
+import argparse
+import zipfile
+import datetime
+
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+import sklearn.metrics
+
+
+def read_df(file_name):
+    print("Reading %s..." % file_name)
+    df = pd.read_csv(file_name, dtype={'id': str, 'Customer ID': str,
+                                       'Product SKU': str, 'Price': str,
+                                       'price': float, 'profit': float})
+    original_len = len(df)
+    print('Read in %d rows...' % original_len)
+    df.dropna(inplace=True)
+    print("Dropped %d rows containing nan..." % (original_len - len(df)))
+
+    if 'Customer ID' in df.columns:
+        df.rename(columns={'Customer ID': 'id'}, inplace=True)
+        print("Renamed Customer ID to id...")
+
+    if 'Order Date' in df.columns:
+        df['Order Date'] = pd.to_datetime(df['Order Date'].str[:10], format="%d/%m/%Y")
+    else:
+        df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d %H:%M:%S")
+    print("Parsed dates...")
+
+    if 'Order Date' in df.columns:
+        df.rename(columns={'Order Date': 'date'}, inplace=True)
+        print("Renamed Order Date to date...")
+
+    if 'Price' in df.columns:
+        df['Price'] = df['Price'].str.replace(',', '').astype(float)
+        df.rename(columns={'Price': 'price'}, inplace=True)
+        print("Renamed Price to price...")
+
+    original_len = len(df)
+    df = df[df['id'] != '-1']
+    print("Dropped %d rows containing -1 in id column..." % (original_len - len(df)))
+    print()
+
+    return df
+
+
+def create_healthy_binary_target(df):
+    average_days_between_orders = df.groupby('id')['date'].apply(
+        lambda x: (x.max() - x.min()) / len(x))
+    average_days_between_orders = average_days_between_orders[
+        average_days_between_orders != datetime.timedelta(0)]
+
+    print("Percentiles for average days between orders for repeated customers:")
+    print(average_days_between_orders.describe(percentiles=np.arange(0, 1, 0.1)))
+    print()
+
+    quantile = average_days_between_orders.quantile(q=0.9)
+    print("90%% of the customers that make repeated purchase make it after %d days." %
+          quantile.days)
+    latest_date = df['date'].max()
+
+    cut_off = latest_date - quantile
+    print("Latest date found in the dataset: %s." % latest_date)
+    print("Customers who have purchases after %s are considered healthy." % cut_off)
+
+    df['healthy'] = df.groupby('id')['date'].transform(max) > cut_off
+    print("Created %d transactions corresponding to healthy customers." % np.sum(df['healthy']))
+
+    unique_customers = df.groupby('id')['healthy'].tail(1)
+    num_unique = len(unique_customers)
+    num_healthy = np.sum(unique_customers)
+    print("Number of unique customers: %d." % num_unique)
+    print("Number of healthy customers: %d." % num_healthy)
+    print("Proportion of healthy customers: %0.2f." % (num_healthy / num_unique))
+
+    return df
+
+
+def add_features(df):
+    print("Adding secondsSinceRegistration feature...")
+    df['secondsSinceRegistration'] = df.groupby('id')['date'].transform(
+        lambda x: (x - x.min())).apply(lambda x: x.total_seconds())
+    print("Adding numOfTransactions features...")
+    df['numOfTransactions'] = df.groupby('id')['date'].transform(lambda x: np.argsort(x) + 1)
+
+    return df
+
+
+def train_test_splitting(df):
+    train_ids, _ = train_test_split(df['id'].unique())
+    train_mask = df['id'].isin(train_ids)
+    df_train = df[train_mask]
+    df_test = df[~train_mask]
+    print("Size of training data set: %d." % len(df_train))
+    print("Size of test data set: %d." % len(df_test))
+    train_mean = df_train['healthy'].mean()
+    print("Proportion of transactions corresponding to healthy customers in training dataset: "
+          "%0.2f." % train_mean)
+    test_mean = df_test['healthy'].mean()
+    print("Proportion of transactions corresponding to healthy customers in test dataset: %0.2f." %
+          test_mean)
+
+    return df_train, df_test
+
+
+def features(df):
+    features_ = ['price', 'secondsSinceRegistration', 'numOfTransactions']
+    if 'profit' in df.columns:
+        features_.append('profit')
+    return features_
+
+
+def create_find_roc_metric(df_test):
+    orders_test_latest = df_test.sort_values('date').groupby('id').tail(1)
+
+    def find_roc_metric(model):
+        healthy_proba = model.predict_proba(orders_test_latest[features(df_test)])[:, 1]
+        roc_metric = sklearn.metrics.roc_auc_score(orders_test_latest['healthy'], healthy_proba)
+        return roc_metric
+
+    return find_roc_metric
+
+
+def find_best_model(metric_func, df_train):
+    print("Finding the best random forest model...")
+    curr_area = -100
+    curr_model = None
+
+    search_dict = {}  # min_impurity_decrease -> ROC area; recorded but never returned
+
+    for min_impurity_decrease in list(np.arange(0.0, 0.3, 0.02)):
+        rf = sklearn.ensemble.RandomForestClassifier(min_impurity_decrease=min_impurity_decrease)
+        rf.fit(df_train[features(df_train)], df_train['healthy'])
+        area = metric_func(rf)
+        if area > curr_area:  # strict '>' keeps the first model that reaches the best score
+            curr_area = area
+            curr_model = rf
+
+        search_dict[min_impurity_decrease] = area
+        print("Min impurity: %0.3f. Area under ROC curve: %0.3f." % (min_impurity_decrease, area))
+
+    return curr_model
+
+
+def print_feature_importances(model, df):
+    print("Most important features:")
+    print(sorted(list(zip(features(df), model.feature_importances_)), key=lambda x: x[1],
+                 reverse=True))
+
+
+def customers_health(model, df):
+    if 'health_score' in df:
+        del df['health_score']
+
+    print("Adding health score column...")
+    orders_latest = df.sort_values('date').groupby('id').tail(1)
+    proba = model.predict_proba(orders_latest[features(df)])[:, 1]
+    health_score = pd.DataFrame({'id': orders_latest['id'], 'health_score': proba})
+    return df.join(health_score.set_index('id'), how='inner', on='id')
+
+
+def print_csv(df):
+    print("Printig csv...")
+    print()
+    print("id,health_score")
+    print(df.groupby('id')['health_score'].max().to_csv())
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Train health model.')
+    parser.add_argument('file_name', type=str, help='an integer for the accumulator')
+
+    args = parser.parse_args()
+    file_name = args.file_name
+
+    with zipfile.ZipFile("orders.zip", 'r') as zip_ref:
+        print("Extracting orders.zip...")
+        zip_ref.extractall('.')
+
+    df = read_df(file_name)
+    df = create_healthy_binary_target(df)
+    df = add_features(df)
+    df_train, df_test = train_test_splitting(df)
+    roc_metric = create_find_roc_metric(df_test)
+    model = find_best_model(roc_metric, df_train)
+    print_feature_importances(model, df)
+    df = customers_health(model, df)
+    print("Saving full csv...")
+    df.to_csv(file_name[:-4] + '-new.csv', index=False)
+    print_csv(df)

From 3b1f244f63672bfb7a48d192b4a6c7828133a53e Mon Sep 17 00:00:00 2001
From: Tom Artiom Fiodorov <a.fiodorov@yahoo.co.uk>
Date: Mon, 23 Jul 2018 01:15:20 +0100
Subject: [PATCH 2/7] adding requirements.txt

---
 requirements.txt | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..5ab9999
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+numpy
+pandas
+scikit-learn

From d1e5214459be23b474f4238eb4317ee57db95ab6 Mon Sep 17 00:00:00 2001
From: Tom Artiom Fiodorov <a.fiodorov@yahoo.co.uk>
Date: Mon, 23 Jul 2018 01:18:31 +0100
Subject: [PATCH 3/7] forgot import

---
 solution.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/solution.py b/solution.py
index 1e2e07a..675f0f9 100755
--- a/solution.py
+++ b/solution.py
@@ -8,6 +8,7 @@
 import numpy as np
 from sklearn.model_selection import train_test_split
 import sklearn.metrics
+import sklearn.ensemble
 
 
 def read_df(file_name):
@@ -53,7 +54,7 @@ def create_healthy_binary_target(df):
     average_days_between_orders = average_days_between_orders[
         average_days_between_orders != datetime.timedelta(0)]
 
-    print("Percentiles for average days between orders for repeated customers:")
+    print("Percentiles for average days between orders for returning customers:")
     print(average_days_between_orders.describe(percentiles=np.arange(0, 1, 0.1)))
     print()
 

From feb17a5e56bf544d6c3b8c6e8e6ee8098245a9de Mon Sep 17 00:00:00 2001
From: Tom Artiom Fiodorov <a.fiodorov@yahoo.co.uk>
Date: Mon, 23 Jul 2018 01:31:12 +0100
Subject: [PATCH 4/7] flushing print

---
 solution.py | 68 +++++++++++++++++++++++++++--------------------------
 1 file changed, 35 insertions(+), 33 deletions(-)

diff --git a/solution.py b/solution.py
index 675f0f9..6131fd3 100755
--- a/solution.py
+++ b/solution.py
@@ -12,38 +12,38 @@
 
 
 def read_df(file_name):
-    print("Reading %s..." % file_name)
+    print("Reading %s..." % file_name, flush=True)
     df = pd.read_csv(file_name, dtype={'id': str, 'Customer ID': str,
                                        'Product SKU': str, 'Price': str,
                                        'price': float, 'profit': float})
     original_len = len(df)
-    print('Read in %d rows...' % original_len)
+    print('Read in %d rows...' % original_len, flush=True)
     df.dropna(inplace=True)
-    print("Dropped %d rows containing nan..." % (original_len - len(df)))
+    print("Dropped %d rows containing nan..." % (original_len - len(df)), flush=True)
 
     if 'Customer ID' in df.columns:
         df.rename(columns={'Customer ID': 'id'}, inplace=True)
-        print("Renamed Customer ID to id...")
+        print("Renamed Customer ID to id...", flush=True)
 
     if 'Order Date' in df.columns:
         df['Order Date'] = pd.to_datetime(df['Order Date'].str[:10], format="%d/%m/%Y")
     else:
         df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d %H:%M:%S")
-    print("Parsed dates...")
+    print("Parsed dates...", flush=True)
 
     if 'Order Date' in df.columns:
         df.rename(columns={'Order Date': 'date'}, inplace=True)
-        print("Renamed Order Date to date...")
+        print("Renamed Order Date to date...", flush=True)
 
     if 'Price' in df.columns:
         df['Price'] = df['Price'].str.replace(',', '').astype(float)
         df.rename(columns={'Price': 'price'}, inplace=True)
-        print("Renamed Price to price...")
+        print("Renamed Price to price...", flush=True)
 
     original_len = len(df)
     df = df[df['id'] != '-1']
-    print("Dropped %d rows containing -1 in id column..." % (original_len - len(df)))
-    print()
+    print("Dropped %d rows containing -1 in id column..." % (original_len - len(df)), flush=True)
+    print(flush=True)
 
     return df
 
@@ -54,37 +54,38 @@ def create_healthy_binary_target(df):
     average_days_between_orders = average_days_between_orders[
         average_days_between_orders != datetime.timedelta(0)]
 
-    print("Percentiles for average days between orders for returning customers:")
-    print(average_days_between_orders.describe(percentiles=np.arange(0, 1, 0.1)))
-    print()
+    print("Percentiles for average days between orders for returning customers:", flush=True)
+    print(average_days_between_orders.describe(percentiles=np.arange(0, 1, 0.1)), flush=True)
+    print(flush=True)
 
     quantile = average_days_between_orders.quantile(q=0.9)
     print("90%% of the customers that make repeated purchase make it after %d days." %
-          quantile.days)
+          quantile.days, flush=True)
     latest_date = df['date'].max()
 
     cut_off = latest_date - quantile
-    print("Latest date found in the dataset: %s." % latest_date)
-    print("Customers who have purchases after %s are considered healthy." % cut_off)
+    print("Latest date found in the dataset: %s." % latest_date, flush=True)
+    print("Customers who have purchases after %s are considered healthy." % cut_off, flush=True)
 
     df['healthy'] = df.groupby('id')['date'].transform(max) > cut_off
-    print("Created %d transactions corresponding to healthy customers." % np.sum(df['healthy']))
+    print("Created %d transactions corresponding to healthy customers." % np.sum(df['healthy']),
+          flush=True)
 
     unique_customers = df.groupby('id')['healthy'].tail(1)
     num_unique = len(unique_customers)
     num_healthy = np.sum(unique_customers)
-    print("Number of unique customers: %d." % num_unique)
-    print("Number of healthy customers: %d." % num_healthy)
-    print("Proportion of healthy customers: %0.2f." % (num_healthy / num_unique))
+    print("Number of unique customers: %d." % num_unique, flush=True)
+    print("Number of healthy customers: %d." % num_healthy, flush=True)
+    print("Proportion of healthy customers: %0.2f." % (num_healthy / num_unique), flush=True)
 
     return df
 
 
 def add_features(df):
-    print("Adding secondsSinceRegistration feature...")
+    print("Adding secondsSinceRegistration feature...", flush=True)
     df['secondsSinceRegistration'] = df.groupby('id')['date'].transform(
         lambda x: (x - x.min())).apply(lambda x: x.total_seconds())
-    print("Adding numOfTransactions features...")
+    print("Adding numOfTransactions features...", flush=True)
     df['numOfTransactions'] = df.groupby('id')['date'].transform(lambda x: np.argsort(x) + 1)
 
     return df
@@ -95,14 +96,14 @@ def train_test_splitting(df):
     train_mask = df['id'].isin(train_ids)
     df_train = df[train_mask]
     df_test = df[~train_mask]
-    print("Size of training data set: %d." % len(df_train))
-    print("Size of test data set: %d." % len(df_test))
+    print("Size of training data set: %d." % len(df_train), flush=True)
+    print("Size of test data set: %d." % len(df_test), flush=True)
     train_mean = df_train['healthy'].mean()
     print("Proportion of transactions corresponding to healthy customers in training dataset: "
-          "%0.2f." % train_mean)
+          "%0.2f." % train_mean, flush=True)
     test_mean = df_test['healthy'].mean()
     print("Proportion of transactions corresponding to healthy customers in test dataset: %0.2f." %
-          test_mean)
+          test_mean, flush=True)
 
     return df_train, df_test
 
@@ -126,7 +127,7 @@ def find_roc_metric(model):
 
 
 def find_best_model(metric_func, df_train):
-    print("Finding the best random forest model...")
+    print("Finding the best random forest model...", flush=True)
     curr_area = -100
     curr_model = None
 
@@ -141,22 +142,23 @@ def find_best_model(metric_func, df_train):
             curr_model = rf
 
         search_dict[min_impurity_decrease] = area
-        print("Min impurity: %0.3f. Area under ROC curve: %0.3f." % (min_impurity_decrease, area))
+        print("Min impurity: %0.3f. Area under ROC curve: %0.3f." % (min_impurity_decrease, area),
+              flush=True)
 
     return curr_model
 
 
 def print_feature_importances(model, df):
-    print("Most important features:")
+    print("Most important features:", flush=True)
     print(sorted(list(zip(features(df), model.feature_importances_)), key=lambda x: x[1],
-                 reverse=True))
+                 reverse=True), flush=True)
 
 
 def customers_health(model, df):
     if 'health_score' in df:
         del df['health_score']
 
-    print("Adding health score column...")
+    print("Adding health score column...", flush=True)
     orders_latest = df.sort_values('date').groupby('id').tail(1)
     proba = model.predict_proba(orders_latest[features(df)])[:, 1]
     health_score = pd.DataFrame({'id': orders_latest['id'], 'health_score': proba})
@@ -164,9 +166,9 @@ def customers_health(model, df):
 
 
 def print_csv(df):
-    print("Printig csv...")
+    print("Printig csv...", flush=True)
     print()
-    print("id,health_score")
+    print("id,health_score", flush=True)
     print(df.groupby('id')['health_score'].max().to_csv())
 
 
@@ -178,7 +180,7 @@ def print_csv(df):
     file_name = args.file_name
 
     with zipfile.ZipFile("orders.zip", 'r') as zip_ref:
-        print("Extracting orders.zip...")
+        print("Extracting orders.zip...", flush=True)
         zip_ref.extractall('.')
 
     df = read_df(file_name)

From 12ba28e95caaf3bb05c32d72fae0fceecb316fa7 Mon Sep 17 00:00:00 2001
From: Tom Artiom Fiodorov <a.fiodorov@yahoo.co.uk>
Date: Mon, 23 Jul 2018 01:36:06 +0100
Subject: [PATCH 5/7] adding orders-2 run log

---
 orders-2-run.log | 65 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 orders-2-run.log

diff --git a/orders-2-run.log b/orders-2-run.log
new file mode 100644
index 0000000..f919653
--- /dev/null
+++ b/orders-2-run.log
@@ -0,0 +1,65 @@
+Extracting orders.zip...
+Reading orders-2.csv...
+Read in 499999 rows...
+Dropped 5 rows containing nan...
+Parsed dates...
+Dropped 1 rows containing -1 in id column...
+
+Percentiles for average days between orders for returning customers:
+count                      51720
+mean     16 days 18:23:21.946281
+std      14 days 01:22:24.347529
+min              0 days 00:00:01
+0%               0 days 00:00:01
+10%       3 days 00:41:38.900000
+20%       5 days 12:14:16.070000
+30.0%     7 days 20:25:11.034782
+40%      10 days 08:18:29.133333
+50%      13 days 01:51:06.333333
+60.0%    16 days 05:22:57.191111
+70%      20 days 05:32:30.519999
+80%      25 days 18:30:30.033333
+90%      35 days 11:43:19.660000
+max      89 days 11:15:48.500000
+Name: date, dtype: object
+
+90% of the customers that make repeated purchase make it after 35 days.
+Latest date found in the dataset: 2018-05-01 23:56:52.
+Customers who have purchases after 2018-03-27 12:13:32.340000 are considered healthy.
+Created 385972 transactions corresponding to healthy customers.
+Number of unique customers: 86207.
+Number of healthy customers: 39508.
+Proportion of healthy customers: 0.46.
+Adding secondsSinceRegistration feature...
+Adding numOfTransactions features...
+Size of training data set: 374392.
+Size of test data set: 125601.
+Proportion of transactions corresponding to healthy customers in training dataset: 0.77.
+Proportion of transactions corresponding to healthy customers in test dataset: 0.77.
+Finding the best random forest model...
+Min impurity: 0.000. Area under ROC curve: 0.738.
+Min impurity: 0.020. Area under ROC curve: 0.724.
+Min impurity: 0.040. Area under ROC curve: 0.724.
+Min impurity: 0.060. Area under ROC curve: 0.500.
+Min impurity: 0.080. Area under ROC curve: 0.500.
+Min impurity: 0.100. Area under ROC curve: 0.500.
+Min impurity: 0.120. Area under ROC curve: 0.500.
+Min impurity: 0.140. Area under ROC curve: 0.500.
+Min impurity: 0.160. Area under ROC curve: 0.500.
+Min impurity: 0.180. Area under ROC curve: 0.500.
+Min impurity: 0.200. Area under ROC curve: 0.500.
+Min impurity: 0.220. Area under ROC curve: 0.500.
+Min impurity: 0.240. Area under ROC curve: 0.500.
+Min impurity: 0.260. Area under ROC curve: 0.500.
+Min impurity: 0.280. Area under ROC curve: 0.500.
+Most important features:
+[('secondsSinceRegistration', 0.32592714760036945), ('price', 0.27353931173752621), ('profit', 0.20867555823625111), ('numOfTransactions', 0.19185798242585314)]
+Adding health score column...
+Saving full csv...
+Printig csv...
+
+id,health_score
+0000b0000ac761edbe8480f8273fdb07,0.7
+00017b2e0c5ecad4d92f6d7b7f4fa1f0,1.0
+00018653eaf26f920973426529654866,0.325
+...

From b3a8405a3c0d17eefbdff3ec398dd3e164b79f89 Mon Sep 17 00:00:00 2001
From: Tom Artiom Fiodorov <a.fiodorov@yahoo.co.uk>
Date: Mon, 23 Jul 2018 01:37:33 +0100
Subject: [PATCH 6/7] adding orders-1 run log

---
 orders-1-run.log | 67 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 orders-1-run.log

diff --git a/orders-1-run.log b/orders-1-run.log
new file mode 100644
index 0000000..f8d7dda
--- /dev/null
+++ b/orders-1-run.log
@@ -0,0 +1,67 @@
+Extracting orders.zip...
+Reading orders-1.csv...
+Read in 533623 rows...
+Dropped 65 rows containing nan...
+Renamed Customer ID to id...
+Parsed dates...
+Renamed Order Date to date...
+Renamed Price to price...
+Dropped 1411 rows containing -1 in id column...
+
+Percentiles for average days between orders for returning customers:
+count                      51501
+mean     32 days 05:31:38.178793
+std      24 days 13:12:07.576172
+min       0 days 01:42:51.428571
+0%        0 days 01:42:51.428571
+10%              8 days 06:00:00
+20%             13 days 00:00:00
+30.0%           17 days 04:00:00
+40%             21 days 12:00:00
+50%      26 days 03:25:42.857142
+60.0%           31 days 10:40:00
+70%             38 days 08:00:00
+80%             47 days 16:00:00
+90%             64 days 00:00:00
+max            179 days 00:00:00
+Name: date, dtype: object
+
+90% of the customers that make repeated purchase make it after 64 days.
+Latest date found in the dataset: 2018-06-27 00:00:00.
+Customers who have purchases after 2018-04-24 00:00:00 are considered healthy.
+Created 271183 transactions corresponding to healthy customers.
+Number of unique customers: 123468.
+Number of healthy customers: 39610.
+Proportion of healthy customers: 0.32.
+Adding secondsSinceRegistration feature...
+Adding numOfTransactions features...
+Size of training data set: 398630.
+Size of test data set: 133517.
+Proportion of transactions corresponding to healthy customers in training dataset: 0.51.
+Proportion of transactions corresponding to healthy customers in test dataset: 0.51.
+Finding the best random forest model...
+Min impurity: 0.000. Area under ROC curve: 0.764.
+Min impurity: 0.020. Area under ROC curve: 0.686.
+Min impurity: 0.040. Area under ROC curve: 0.684.
+Min impurity: 0.060. Area under ROC curve: 0.500.
+Min impurity: 0.080. Area under ROC curve: 0.500.
+Min impurity: 0.100. Area under ROC curve: 0.500.
+Min impurity: 0.120. Area under ROC curve: 0.500.
+Min impurity: 0.140. Area under ROC curve: 0.500.
+Min impurity: 0.160. Area under ROC curve: 0.500.
+Min impurity: 0.180. Area under ROC curve: 0.500.
+Min impurity: 0.200. Area under ROC curve: 0.500.
+Min impurity: 0.220. Area under ROC curve: 0.500.
+Min impurity: 0.240. Area under ROC curve: 0.500.
+Min impurity: 0.260. Area under ROC curve: 0.500.
+Min impurity: 0.280. Area under ROC curve: 0.500.
+Most important features:
+[('secondsSinceRegistration', 0.45706872620175287), ('price', 0.41632602073665953), ('numOfTransactions', 0.1266052530615876)]
+Adding health score column...
+Saving full csv...
+Printig csv...
+
+id,health_score
+1000001170897,0.2505100887996181
+1000015470766,1.0
+...

From 224013e8fba7416c0d60def115fc78ff9fe3fd8d Mon Sep 17 00:00:00 2001
From: Tom Artiom Fiodorov <a.fiodorov@yahoo.co.uk>
Date: Mon, 23 Jul 2018 01:56:37 +0100
Subject: [PATCH 7/7] better help

---
 solution.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/solution.py b/solution.py
index 6131fd3..6d8ee3e 100755
--- a/solution.py
+++ b/solution.py
@@ -174,7 +174,7 @@ def print_csv(df):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Train health model.')
-    parser.add_argument('file_name', type=str, help='an integer for the accumulator')
+    parser.add_argument('file_name', type=str, help='orders-1.csv or orders-2.csv.')
 
     args = parser.parse_args()
     file_name = args.file_name