From 6a7a8f3d8681fc13b436688736fcd130e145a876 Mon Sep 17 00:00:00 2001
From: daviz
Date: Sun, 28 Jun 2015 17:37:23 +0200
Subject: [PATCH 1/3] multilabel onevsrest SGDC classification

---
Reviewer notes (free text after the separator; not part of the commit):
  * Adds a flat script that loads "dataset/demographic_distribution000",
    keeps 'es_barsandrestaurants' rows for the listed 080xx zip codes,
    groups them by weekday / age_interval / gender / payment levels, and
    fits a OneVsRestClassifier(SGDClassifier) that predicts the set of
    merchant zip codes for each group.
  * The script uses `print` statements, i.e. it is Python 2 only.
  * "amount_level" is assigned twice with the same pd.cut(...) call, and
    the column is not used by the groupby or by X afterwards.
  * numpy is imported as `np` but not referenced in this patch.
  * NOTE(review): leading whitespace of continuation lines in this series
    should be confirmed against the original commits.

 multilabel-OneVsRest-SGDC-classification.py | 92 +++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 multilabel-OneVsRest-SGDC-classification.py

diff --git a/multilabel-OneVsRest-SGDC-classification.py b/multilabel-OneVsRest-SGDC-classification.py
new file mode 100644
index 0000000..24cd502
--- /dev/null
+++ b/multilabel-OneVsRest-SGDC-classification.py
@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Jun 27 12:47:40 2015
+
+@author: daviz
+"""
+
+import pandas as pd
+import numpy as np
+
+# Load datasets
+names = ["merchant_zipcode", "date", "category", "age_interval", "gender", "merchants", "cards", "payments", "avg_payment", "max_payment", "min_payment", "std"]
+demo_stats = pd.read_csv("dataset/demographic_distribution000", delim_whitespace=True,
+                         names= names, parse_dates=["date"],
+                         dtype = {'merchant_zipcode': str})
+
+demo_stats = demo_stats.loc[demo_stats["gender"] != 'unknown']
+demo_stats = demo_stats.loc[demo_stats["age_interval"] != 'unknown']
+demo_stats = demo_stats[demo_stats.category == 'es_barsandrestaurants']
+
+bcn_zipcodes = ['08001', '08002', '08003', '08004', '08005', '08006', '08007',
+                '08008', '08009', '08010', '08011', '08012', '08013', '08014',
+                '08015', '08016', '08017', '08018', '08019', '08020', '08021',
+                '08022', '08023', '08024', '08025', '08026', '08027', '08028',
+                '08029', '08030', '08031', '08032', '08033', '08034', '08035',
+                '08036', '08037', '08038', '08039', '08040', '08041', '08042']
+
+demo_stats = demo_stats[demo_stats.merchant_zipcode.apply(lambda zp: zp in bcn_zipcodes)]
+
+demo_stats["weekday"] = demo_stats["date"].map(lambda d: (d.weekday()))
+demo_stats["weekday"] = demo_stats["weekday"].astype('string')
+
+demo_stats["amount"] = demo_stats["payments"] * demo_stats["avg_payment"]
+demo_stats = demo_stats[demo_stats.amount < 500]
+demo_stats = demo_stats[demo_stats.amount > 0]
+
+demo_stats["amount_level"] = pd.cut(demo_stats.amount, 7, labels = ["very-low", "low", "low-medium", "medium", "medium-high", "high", "very-high"])
+demo_stats["amount_level"] = pd.cut(demo_stats.amount, 7, labels = ["very-low", "low", "low-medium", "medium", "medium-high", "high", "very-high"])
+demo_stats["max_payment_level"] = pd.cut(demo_stats.max_payment, 7, labels = ["very-low", "low", "low-medium", "medium", "medium-high", "high", "very-high"])
+demo_stats["min_payment_level"] = pd.cut(demo_stats.min_payment, 7, labels = ["very-low", "low", "low-medium", "medium", "medium-high", "high", "very-high"])
+
+demo_stats = demo_stats.groupby(["weekday","age_interval", "gender", "max_payment_level", "min_payment_level"])['merchant_zipcode'].apply(list)
+demo_stats = demo_stats.reset_index()
+
+demo_stats['merchant_zipcode'] = demo_stats['merchant_zipcode'].map(lambda d: tuple(list(set(d))))
+
+X = demo_stats[["weekday","age_interval", "gender", "max_payment_level", "min_payment_level"]]
+y = demo_stats[["merchant_zipcode"]]
+
+# Allow to use machine learning with categorical features.
+from sklearn.feature_extraction import DictVectorizer
+vec = DictVectorizer()
+X_vectorized = vec.fit_transform(X.to_dict("records")).toarray()
+
+# Multilabel classification
+from sklearn.preprocessing import MultiLabelBinarizer
+mlb = MultiLabelBinarizer()
+y_multilabel = mlb.fit_transform(y.values.ravel())
+
+# OneVsRest
+from sklearn.multiclass import OneVsRestClassifier
+from sklearn.linear_model import SGDClassifier
+
+classifier = OneVsRestClassifier(
+                SGDClassifier(
+                    loss= 'hinge',
+                    alpha=0.00001,
+                    penalty='elasticnet')).fit(X_vectorized, y_multilabel)
+
+
+# Predict new examples
+example = {'age_interval': '35-44',
+           'max_payment_level': 'low',
+           'min_payment_level': 'low',
+           'gender': 'male',
+           'weekday': '5'}
+
+example_vectorized = vec.transform(example).toarray()
+example_predicted = classifier.predict(example_vectorized)
+print "Person that can not spend lot of money on restaurant:"
+print mlb.inverse_transform(example_predicted)
+print "\n\n"
+
+example = {'age_interval': '35-44',
+           'max_payment_level': 'high',
+           'min_payment_level': 'high',
+           'gender': 'male',
+           'weekday': '5'}
+example_vectorized = vec.transform(example).toarray()
+example_predicted = classifier.predict(example_vectorized)
+print "Person that can spend more money on restaurant:"
+print mlb.inverse_transform(example_predicted)

From 8da5e80bff4c07f9ca6ac2f1aec4abd843e803a1 Mon Sep 17 00:00:00 2001
From: daviz
Date: Mon, 29 Jun 2015 19:21:57 +0200
Subject: [PATCH 2/3] adding classification scoring

---
Reviewer notes (free text after the separator; not part of the commit):
  * Holds out 20% of the rows (PRC = 0.2) with train_test_split, fits on
    the training part only, and prints a classification_report for the
    held-out part.
  * Besides the scoring named in the subject, this patch also switches the
    SGDClassifier penalty from 'elasticnet' to 'l2' and drops the
    duplicated "amount_level" assignment.
  * train_test_split is called with test_size only; no random_state is
    passed, so presumably the split and the printed report differ between
    runs -- confirm whether that is intended.
  * NOTE(review): `sklearn.cross_validation` exists only in older
    scikit-learn releases; newer ones provide train_test_split from
    `sklearn.model_selection`. Confirm the targeted version.

 multilabel-OneVsRest-SGDC-classification.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/multilabel-OneVsRest-SGDC-classification.py b/multilabel-OneVsRest-SGDC-classification.py
index 24cd502..9a91976 100644
--- a/multilabel-OneVsRest-SGDC-classification.py
+++ b/multilabel-OneVsRest-SGDC-classification.py
@@ -7,6 +7,7 @@
 
 import pandas as pd
 import numpy as np
+from sklearn import cross_validation
 
 # Load datasets
 names = ["merchant_zipcode", "date", "category", "age_interval", "gender", "merchants", "cards", "payments", "avg_payment", "max_payment", "min_payment", "std"]
@@ -34,7 +35,6 @@
 demo_stats = demo_stats[demo_stats.amount < 500]
 demo_stats = demo_stats[demo_stats.amount > 0]
 
-demo_stats["amount_level"] = pd.cut(demo_stats.amount, 7, labels = ["very-low", "low", "low-medium", "medium", "medium-high", "high", "very-high"])
 demo_stats["amount_level"] = pd.cut(demo_stats.amount, 7, labels = ["very-low", "low", "low-medium", "medium", "medium-high", "high", "very-high"])
 demo_stats["max_payment_level"] = pd.cut(demo_stats.max_payment, 7, labels = ["very-low", "low", "low-medium", "medium", "medium-high", "high", "very-high"])
 demo_stats["min_payment_level"] = pd.cut(demo_stats.min_payment, 7, labels = ["very-low", "low", "low-medium", "medium", "medium-high", "high", "very-high"])
@@ -47,6 +47,8 @@
 X = demo_stats[["weekday","age_interval", "gender", "max_payment_level", "min_payment_level"]]
 y = demo_stats[["merchant_zipcode"]]
 
+
+
 # Allow to use machine learning with categorical features.
 from sklearn.feature_extraction import DictVectorizer
 vec = DictVectorizer()
@@ -57,16 +59,24 @@
 mlb = MultiLabelBinarizer()
 y_multilabel = mlb.fit_transform(y.values.ravel())
 
+
+PRC = 0.2
+X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_vectorized, y_multilabel, test_size=PRC)
+
+
 # OneVsRest
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.linear_model import SGDClassifier
+from sklearn.metrics import classification_report
 
 classifier = OneVsRestClassifier(
                 SGDClassifier(
                     loss= 'hinge',
                     alpha=0.00001,
-                    penalty='elasticnet')).fit(X_vectorized, y_multilabel)
+                    penalty='l2')).fit(X_train, y_train)
 
+y_predicted= classifier.predict(X_test)
+print classification_report(y_test, y_predicted)
 
 # Predict new examples
 example = {'age_interval': '35-44',

From b0d18156ff35660e068189a75960733e2ab9c4b1 Mon Sep 17 00:00:00 2001
From: daviz
Date: Tue, 30 Jun 2015 10:35:31 +0200
Subject: [PATCH 3/3] adding scoring and trying other classifier randomForest or OneVsRest with decision tree or svm linear

---
Reviewer notes (free text after the separator; not part of the commit):
  * Replaces the SGDClassifier base estimator with
    tree.DecisionTreeClassifier(criterion="entropy", max_features="log2")
    inside OneVsRestClassifier, and moves fitting to a separate
    classifier.fit(X_train, y_train) call.
  * The RandomForestClassifier and linear-kernel SVC alternatives named in
    the subject are added only as commented-out code; the decision tree is
    the one that actually runs.
  * The two prediction examples change as well: the first now uses
    gender 'female', the second age_interval '45-54'.
  * The file name still says "SGDC" although SGDClassifier is no longer
    imported or used after this patch.

 multilabel-OneVsRest-SGDC-classification.py | 38 +++++++++++++--------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/multilabel-OneVsRest-SGDC-classification.py b/multilabel-OneVsRest-SGDC-classification.py
index 9a91976..440e7f0 100644
--- a/multilabel-OneVsRest-SGDC-classification.py
+++ b/multilabel-OneVsRest-SGDC-classification.py
@@ -48,7 +48,6 @@
 y = demo_stats[["merchant_zipcode"]]
 
 
-
 # Allow to use machine learning with categorical features.
 from sklearn.feature_extraction import DictVectorizer
 vec = DictVectorizer()
@@ -63,27 +62,36 @@
 PRC = 0.2
 X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_vectorized, y_multilabel, test_size=PRC)
 
-
 # OneVsRest
 from sklearn.multiclass import OneVsRestClassifier
-from sklearn.linear_model import SGDClassifier
 from sklearn.metrics import classification_report
 
-classifier = OneVsRestClassifier(
-                SGDClassifier(
-                    loss= 'hinge',
-                    alpha=0.00001,
-                    penalty='l2')).fit(X_train, y_train)
+from sklearn import tree
+classifier = OneVsRestClassifier(tree.DecisionTreeClassifier(criterion="entropy",
+                                                             max_features="log2"))
+#from sklearn import ensemble
+#classifier = ensemble.RandomForestClassifier(criterion="entropy",
+#                                             max_features="log2",
+#                                             n_estimators= 500)
+
+#from sklearn import svm
+#classifier = OneVsRestClassifier(svm.SVC(kernel='linear'))
 
-y_predicted= classifier.predict(X_test)
+classifier.fit(X_train, y_train)
+
+y_predicted = classifier.predict(X_test)
+
+# Interpret carefully since it is a multilabel classification
 print classification_report(y_test, y_predicted)
 
 # Predict new examples
-example = {'age_interval': '35-44',
+example = {
+           'age_interval': '35-44',
            'max_payment_level': 'low',
            'min_payment_level': 'low',
-           'gender': 'male',
-           'weekday': '5'}
+           'gender': 'female',
+           'weekday': '5'
+          }
 
 example_vectorized = vec.transform(example).toarray()
 example_predicted = classifier.predict(example_vectorized)
@@ -91,11 +99,13 @@
 print mlb.inverse_transform(example_predicted)
 print "\n\n"
 
-example = {'age_interval': '35-44',
+example = {
+           'age_interval': '45-54',
            'max_payment_level': 'high',
            'min_payment_level': 'high',
            'gender': 'male',
-           'weekday': '5'}
+           'weekday': '5'
+          }
 example_vectorized = vec.transform(example).toarray()
 example_predicted = classifier.predict(example_vectorized)
 print "Person that can spend more money on restaurant:"