def load_data(filename):
    '''
    Read tab-separated play-count triplets into a sparse user-by-song matrix.
    The file has no header and three columns: user id, song id, play count.
    Rows/columns follow the pandas category codes of the ids (sorted ids).
    Inputs:
    - filename: path of the triplet file
    Output:
    - parsed_matrix: lil_matrix of size(#users, #songs) holding float play counts
    '''
    df = pd.read_csv(filename, sep='\t', names=['user_id', 'song_id', 'playcount'])
    users = df['user_id'].astype('category')
    songs = df['song_id'].astype('category')
    # The matrix is built sparsely from the start; the previous version also
    # allocated an unused dense (#users, #songs) array, which does not fit in
    # memory for realistically sized listening logs.
    shape = (len(users.cat.categories), len(songs.cat.categories))
    counts = df['playcount'].astype(float).values
    parsed_matrix = coo_matrix(
        (counts, (users.cat.codes.values, songs.cat.codes.values)),
        shape=shape).tolil()
    return parsed_matrix


def data_sampling(data, num_users, num_songs):
    '''
    Obtain a random subset of the data.
    Inputs:
    - data: a sparse matrix of size(#users, #songs) that supports fancy indexing
    - num_users: number of rows to keep (sampled without replacement)
    - num_songs: number of columns to keep (sampled without replacement)
    Output:
    - samples: sparse matrix of size(num_users, num_songs)
    Raises ValueError (from np.random.choice) when more rows/columns are
    requested than the data holds.
    '''
    num_instances, num_features = data.shape
    # users are drawn before songs; keep this order so seeded runs reproduce
    sample_user_index = np.random.choice(num_instances, num_users, replace=False)
    sample_song_index = np.random.choice(num_features, num_songs, replace=False)
    samples = data[sample_user_index, :]
    return samples[:, sample_song_index]


def ivf(x):
    '''
    Weight every song by its inverse user frequency log(#users / #listeners)
    so that commonly occurring songs count less.
    Inputs:
    - x: a sparse matrix of size(#users, #songs); it is NOT modified
    Output:
    - a new lil_matrix of the same shape with float weights. Songs nobody
      listened to keep weight 0 instead of producing a division by zero.
    '''
    num_user, num_song = x.shape
    entries = x.tocsr().tocoo()
    values = entries.data.astype(np.float64)
    # nj[j] = number of users with a non-zero play count for song j
    nj = np.bincount(entries.col[values != 0], minlength=num_song)
    fj = np.zeros(num_song, dtype=np.float64)
    has_listener = nj > 0
    fj[has_listener] = np.log(float(num_user) / nj[has_listener])
    # scale each stored entry by the weight of its column in one vectorised step
    weighted = coo_matrix((values * fj[entries.col], (entries.row, entries.col)),
                          shape=x.shape).tocsr()
    weighted.eliminate_zeros()
    return weighted.tolil()


def prep2(delete_user_song, if_ivf):
    '''
    Produce the inverse user frequency feature if needed, and normalize each
    user row with sklearn's normalize (default norm).
    Inputs:
    - delete_user_song: a sparse matrix of size(#users, #songs)
    - if_ivf: 1 to apply ivf() before normalizing, anything else to skip it
    Output:
    - user_song_normalized: lil_matrix (float64) of the same shape
    '''
    if if_ivf == 1:
        delete_user_song = ivf(delete_user_song)
    user_song_normalized = lil_matrix(normalize(delete_user_song, axis=1), dtype=np.float64)
    return user_song_normalized


def train_test_split(data, size):
    '''
    Hold out `size` randomly chosen non-zero entries of every user for testing.
    Inputs:
    - data: ndarray of size(#users, #songs)
    - size: number of songs to hold out per user (a count, not a fraction)
    Output:
    - train, test: lil_matrix pair with disjoint non-zero entries whose sum is data
    Users with fewer than `size` non-zero songs have all of them held out
    instead of aborting the whole split with a ValueError.
    '''
    data = np.asarray(data)
    test = np.zeros(data.shape)
    train = data.copy()
    for user in range(data.shape[0]):
        candidates = data[user, :].nonzero()[0]
        n_held_out = min(size, candidates.size)
        if n_held_out <= 0:
            continue
        test_index = np.random.choice(candidates, size=n_held_out, replace=False)
        train[user, test_index] = 0.
        test[user, test_index] = data[user, test_index]
    # Test and training are truly disjoint
    assert np.all(np.multiply(train, test) == 0)
    train = lil_matrix(train, dtype=np.float64)
    test = lil_matrix(test, dtype=np.float64)
    return train, test
def parameter_tuning(raw_data, test_size, rho_range=(1, 1.5, 1.999, 2.5, 3)):
    '''
    Tune two parameters by grid search:
        if_ivf: whether to use Inverse User Frequency or original data feature
        rho: case amplification parameter
    Inputs:
    - raw_data: a sparse matrix of size(#users, #songs); only a copy is transformed
    - test_size: scalar, number of songs held out per user for testing
      (it is forwarded unchanged to train_test_split)
    - rho_range: iterable of rhos used for parameter tuning
    Output:
    - best_para: tuple (rho, if_ivf) with the smallest value returned by run()
    Raises ValueError when rho_range is empty.
    '''
    # Materialise once: the grid is walked twice (once per if_ivf setting).
    rho_values = list(rho_range)
    if not rho_values:
        raise ValueError('rho_range must contain at least one value')

    result = {}
    for if_ivf in [0, 1]:
        data = prep2(raw_data.copy(), if_ivf=if_ivf)
        # NOTE: every if_ivf setting draws its own random split
        train, test = train_test_split(data.toarray(), test_size)
        # iterate over the caller's grid; it used to be ignored in favour of
        # a hard-coded list
        for rho in rho_values:
            result[(rho, if_ivf)] = run(train, test, rho)

    for key in result:
        print(str(key) + ': ' + str(result[key]))
    # smaller is better, so the winner is the minimum
    best_para = min(result, key=result.get)
    print('best parameter is rho = ' + str(best_para[0]) + ' if_ivf = ' + str(best_para[1]))
    return best_para
def top_k_song(raw_data, user_id, k):
    '''
    Get the ids of the k songs the given user listened to most.
    Inputs:
    - raw_data: a sparse matrix of size(#users, #songs) that supports row indexing
    - user_id: row index of the user
    - k: maximum number of songs to return
    Output:
    - topksong: array of song indices ordered by increasing play count (the
      most played song is last). Songs with a zero play count are never
      returned, so fewer than k ids come back for users with fewer than k
      songs, and k <= 0 yields an empty array.
    '''
    song_arr = raw_data[user_id, :].toarray().ravel()
    if k <= 0:
        # guard: a slice of [-0:] would return every song
        return np.array([], dtype=np.intp)
    order = np.argsort(song_arr)
    listened = order[song_arr[order] != 0]
    topksong = listened[-k:]
    return topksong
= np.load('song_index_dictionary.npy').item()\n", + "\n", + "for user_id in user_ids:\n", + " # the list of song ids generated by the recommendation system\n", + " rec_song = np.array([read_dict[x] for x in recommend(raw_data, best_para, if_outside_user = 0, num_rec = 3, user_id = user_id)])\n", + " #the list of 10 song ids that the user listened to most frequently.\n", + " topksong = np.array([read_dict[x] for x in top_k_song(raw_data, user_id, 10)])\n", + " rec_song_name = []\n", + " topksong_name = []\n", + " # convert song ids to the name of the songs using unique_tracks.txt for both rec_song and topksong\n", + " # store the song name for each user in dictionary rec_dict, key is the user_id, \n", + " # value is a list of song names and artists\n", + " for i in range(rec_song.shape[0]):\n", + " song_id = rec_song[i]\n", + " song_name = None\n", + " searchfile = open(\"unique_tracks.txt\", \"r\")\n", + " for line in searchfile:\n", + " if song_id in line:\n", + " song_arr = line.rsplit('', 2)\n", + " song_name = song_arr[2].rstrip()\n", + " artist_name = song_arr[1].rstrip()\n", + " rec_song_name.append((song_name,artist_name))\n", + " searchfile.close()\n", + " for i in range(topksong.shape[0]):\n", + " song_id = topksong[i]\n", + " song_name = None\n", + " searchfile = open(\"unique_tracks.txt\", \"r\")\n", + " for line in searchfile:\n", + " if song_id in line: \n", + " song_arr = line.rsplit('', 2)\n", + " song_name = song_arr[2].rstrip()\n", + " artist_name = song_arr[1].rstrip()\n", + " topksong_name.append((song_name,artist_name))\n", + " searchfile.close()\n", + " rec_dict[user_id] = (topksong_name, rec_song_name)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{279: ([('Prête A Porter', 'Paris Combo'),\n", + " ('Invocation: Attica Blues', 'Archie Shepp / William Kunstler'),\n", + " (\"Where You'll Find Me Now\", 'Neutral Milk Hotel'),\n", + " ('Under The Gun', 'The 
Killers'),\n", + " ('Anthems For a Seventeen Year-Old Girl', 'Broken Social Scene'),\n", + " ('Angry Chair', 'Alice In Chains'),\n", + " ('Would You Go With Me', 'Josh Turner'),\n", + " ('Spilt Needles (Album)', 'The Shins'),\n", + " ('Comet Course', 'Flying Lotus'),\n", + " ('Jeane', 'The Smiths')],\n", + " [('Scream', 'Michael Jackson'),\n", + " ('Window Blues', 'Lykke Li'),\n", + " ('I Believe In A Thing Called Love', 'The Darkness')]),\n", + " 854: ([(\"It's My Party\", 'Lesley Gore'),\n", + " ('People', 'Journey'),\n", + " ('When A Man Loves A Woman', 'Percy Sledge'),\n", + " ('Window Blues', 'Lykke Li'),\n", + " ('Ride For You (Album Version)', 'Danity Kane'),\n", + " (\"Un-thinkable (I'm Ready)\", 'Alicia Keys'),\n", + " ('The Wild Boys', 'Duran Duran'),\n", + " ('Secret Hell', 'dEUS'),\n", + " (\"Things I Don't Understand\", 'Coldplay'),\n", + " ('So Glad To See You', 'Hot Chip')],\n", + " [('Jerry Was A Race Car Driver', 'Primus'),\n", + " ('Neon', 'John Mayer'),\n", + " ('More Than Everything', 'Gareth Emery')]),\n", + " 986: ([('Ego', 'Beyoncé'),\n", + " ('California One / Youth and Beauty Brigade', 'The Decemberists'),\n", + " ('The Mask (Featuring Ghostface Killah) (Album Version)', 'Danger Doom'),\n", + " ('Sei Lá Mangueira', 'Elizeth Cardoso'),\n", + " ('Times Like These', 'Jack Johnson'),\n", + " ('Never Ending Math Equation', 'Modest Mouse'),\n", + " ('Hallowed Be My Name', 'HAMMERFALL'),\n", + " ('Sit Down. Stand Up', 'Radiohead'),\n", + " ('But Tonight We Dance', 'Rise Against'),\n", + " ('Last Night On Earth [feat. 
Green Day & The Cast Of American Idiot] (Album Version)',\n", + " 'Green Day')],\n", + " [('Wait', 'Alexi Murdoch'),\n", + " ('Country Road', 'James Taylor'),\n", + " ('Tabaco Y Chanel', 'Bacilos')]),\n", + " 992: ([('Do You Wanna', 'The Kooks'),\n", + " ('The Fake Headlines', 'The New Pornographers'),\n", + " ('Murder The Government', 'NOFX'),\n", + " ('Let Me', 'Rihanna'),\n", + " ('Middle Man', 'Jack Johnson'),\n", + " ('Stilettos', 'Holy Fuck'),\n", + " ('Made For You', 'OneRepublic'),\n", + " ('Sex In Secret', 'Cabaret Voltaire'),\n", + " ('Jazz Street', 'Jaco Pastorius_ Brian Melvin'),\n", + " ('This Is Nowhere', 'The Airborne Toxic Event')],\n", + " [('Jerry Was A Race Car Driver', 'Primus'),\n", + " ('Neon', 'John Mayer'),\n", + " ('Ego', 'Beyoncé')]),\n", + " 1722: ([('Contra La Corriente', 'Marc Anthony'),\n", + " ('Hallowed Be My Name', 'HAMMERFALL'),\n", + " ('Ego', 'Beyoncé'),\n", + " ('Daughter', 'Bassholes'),\n", + " (\"Soon We'll Be Found\", 'Sia'),\n", + " ('(iii)', 'The Gerbils'),\n", + " ('The General Specific (Album)', 'Band Of Horses'),\n", + " (\"I'm Done\", 'The Pussycat Dolls'),\n", + " ('Diamonds From Sierra Leone', 'Kanye West / Jay-Z'),\n", + " ('God Put A Smile Upon Your Face', 'Coldplay')],\n", + " [('New Direction (Original Version)', 'Echo And The Bunnymen'),\n", + " ('Welcome To Hollywood', 'Beyoncé feat. 
Jay-Z'),\n", + " (\"Can't Help But Wait (Album Version)\", 'Trey Songz')]),\n", + " 1997: ([(\"Everything's Magic\", 'Angels and Airwaves'),\n", + " ('A Beautiful Mine', 'RJD2'),\n", + " ('Esisti Tu', 'Valerio Scanu'),\n", + " ('Where The White Boys Dance', 'The Killers'),\n", + " ('Itkupilli (2001 Digital Remaster)', 'Neljä Ruusua'),\n", + " ('22', 'Lily Allen'),\n", + " ('Puto', 'Molotov'),\n", + " ('Proud Mary', 'Creedence Clearwater Revival'),\n", + " ('The KKK Took My Baby Away (LP Version )', 'Ramones'),\n", + " ('The Slow Descent Into Alcoholism', 'The New Pornographers')],\n", + " [('Winter Song', 'Sara Bareilles'),\n", + " ('More Than Everything', 'Gareth Emery'),\n", + " ('New Direction (Original Version)', 'Echo And The Bunnymen')]),\n", + " 2937: ([('Sincerité Et Jalousie', 'Alliance Ethnik'),\n", + " ('A Rush Of Blood To The Head', 'Coldplay'),\n", + " ('Clint Eastwood (Ed Case/Sweetie Irie Refix) (Edit)', 'Gorillaz'),\n", + " ('The Caterpillar', 'The Cure'),\n", + " ('De Weg', 'Guus Meeuwis'),\n", + " ('Ballad Of A Comeback Kid', 'The New Pornographers'),\n", + " ('I Found A Whistle', 'MGMT'),\n", + " ('Ragoo', 'Kings Of Leon'),\n", + " ('One Thing', 'Finger Eleven'),\n", + " ('Winter Song', 'Sara Bareilles')],\n", + " [('Rebirth of Slick (Cool Like Dat) (2005 Digital Remaster) (Explicit)',\n", + " 'Digable Planets'),\n", + " (\"Still Don't Give A Fuck\", 'Eminem'),\n", + " ('Slam', 'Pendulum')]),\n", + " 3038: ([('Zero', 'Yeah Yeah Yeahs'),\n", + " ('I Thought I Saw Your Face Today', 'She & Him'),\n", + " ('Crossfire', 'Rick Cua'),\n", + " ('So Long', 'Rilo Kiley'),\n", + " ('Rayando el sol', 'Maná'),\n", + " ('Escapémonos', 'Marc Anthony;Jennifer Lopez'),\n", + " ('If It Means A Lot To You', 'A Day To Remember'),\n", + " (\"Everything's Ruined\", 'Faith No More'),\n", + " ('Pop Champagne', 'Jim Jones & Ron Browz featuring Juelz Santana'),\n", + " ('Firestarter', 'The Prodigy')],\n", + " [('What Else Is There?', 'Röyksopp'),\n", + " ('Circling', 
'Four Tet'),\n", + " ('Le Corps de Notre Seigneur', 'Choeur Arménien de Sofia')]),\n", + " 3099: ([('Tiger Feet', 'Mud'),\n", + " ('Girl Money', 'Kix'),\n", + " ('Zebra', 'Beach House'),\n", + " ('A Day Without Me', 'U2'),\n", + " ('Lady Picture Show (LP Version)', 'Stone Temple Pilots'),\n", + " ('Black Horse And The Cherry Tree (Radio Version)', 'KT Tunstall'),\n", + " ('Making Time', 'The Creation'),\n", + " ('Seven Nation Army (Album Version)', 'The White Stripes'),\n", + " ('Breadfan', 'Metallica'),\n", + " (\"Don't Look Back\", 'Boston')],\n", + " [('More Than Everything', 'Gareth Emery'),\n", + " ('Hallowed Be My Name', 'HAMMERFALL'),\n", + " ('Ego', 'Beyoncé')]),\n", + " 3117: ([('Hasta Ayer', 'Marc Anthony'),\n", + " ('I Kissed A Girl', 'Katy Perry'),\n", + " ('Breathe . Something/Stellar STar', 'Flying Lotus'),\n", + " ('Saturdays', 'Cut Copy'),\n", + " ('Crackers and Cheese', 'Tea Leaf Green'),\n", + " ('California One / Youth and Beauty Brigade', 'The Decemberists'),\n", + " ('Times Like These', 'Foo Fighters'),\n", + " (\"Don't Stop The Music\", 'Jamie Cullum'),\n", + " ('Again I Go Unnoticed', 'Dashboard Confessional'),\n", + " (\"I'm Just A Man\", 'Jason Aldean')],\n", + " [('Hallowed Be My Name', 'HAMMERFALL'),\n", + " ('Ego', 'Beyoncé'),\n", + " ('Kalopsia', 'The Blizzard')])}" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# key is the user_id, \n", + "# value is a list of song names and artists\n", + "# first list of the values are the song names of the past listening hitory\n", + "# second list of the values are the song name of the \n", + "rec_dict" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:anaconda]", + "language": "python", + "name": "conda-env-anaconda-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/neighborhood_model.py b/neighborhood_model.py new file mode 100644 index 0000000..9d61561 --- /dev/null +++ b/neighborhood_model.py @@ -0,0 +1,108 @@ +import numpy as np +import pandas as pd +from scipy.sparse import coo_matrix +import matplotlib.pyplot as plt +from sklearn.preprocessing import normalize +from scipy.sparse import lil_matrix + + +class NeighborhoodModel(object): + + def __init__(self): + + self.pred_ranks_percentile = None + self.pred_ranks = None + self.num_instances_train, self.num_features_train = None, None + #case amplification factor + + def fit(self, train, test, rho): + self.train = train + self.test = test + pred = self.predict(self.train, rho) + self.pred_ranks, self.pred_ranks_percentile = self.ranking(pred) + + #output the predicted score for each item for each user + def predict(self, x, rho): + v_bar = lil_matrix(x.sum(axis = 1)) + weight = (x.dot(x.T)).multiply(v_bar.dot(v_bar.T).power(-1/2)).power(rho) + + for i in range(weight.shape[0]): + weight[i,i] = 0 + pred = weight.dot(x).todense() + return pred + + def predict_ind(self, x, user_pref_sparse): + v_bar = lil_matrix(x.sum(axis = 1)) + weight = (user_pref_sparse.dot(x.T)).multiply(v_bar.power(-1/2)) + for i in range(weight.shape[0]): + weight[i,i] = 0 + pred = weight.dot(x).todense() + return pred + + #produce the ranking percentile for each item for each user + def ranking(self, pred): + num_instances_train, num_features_train = pred.shape[0], pred.shape[1] + temp = pred.argsort(axis = 1) + #produce the abosulte ranks for each item for each user + pred_ranks = np.empty_like(temp) + for i in range(num_instances_train): + pred_ranks[i,temp[i,:]] = np.arange(num_features_train - 1, -1, -1) + #convert the ranks to rank percentile + pred_ranks_percentile = pred_ranks / np.max(pred_ranks) * 100 + return pred_ranks, pred_ranks_percentile + 
+ #output expected percentile ranking of a watching unit + def evaluate(self): + test = self.test + + num_instances_train, num_features_train = self.num_instances_train, self.num_features_train + pred_ranks_percentile = self.pred_ranks_percentile + test = test.todense() + metrics = np.sum(np.multiply(test, pred_ranks_percentile))/np.sum(test) + return metrics + + #recommend the top "num_rec" songs to user "user_id" + def recommend(self, data, user_id,rho = 1, num_rec = 3): + pred = self.predict(data, rho) + pred_ranks, pred_ranks_percentile = self.ranking(pred) + song_rank_list = np.asarray(pred_ranks[user_id,:]).squeeze() + #produce the song list sorted by their scores + rank_index = np.argsort(song_rank_list) + rec_list = [] + num = 0 + song_arr = np.asarray(data[user_id,:].todense()).squeeze() + #songs that the user has already listened + song_in_bucket = np.nonzero(song_arr)[0] + for item in rank_index: + if num >= num_rec: + break + #exclude the songs that the user has already listened + if item not in song_in_bucket: + rec_list.append(item) + num += 1 + + return rec_list + +#recommend songs for a user not in the data +#input a array of the times of the songs that the user has listened + def recommend_out(self, data, user_pref, num_rec = 3): + user_pref_sparse = lil_matrix(user_pref, dtype = np.float64) + #similarity_ind = user_pref_sparse.dot(data.T) + #pred = similarity_ind.dot(data).todense() + pred = self.predict_ind(data, user_pref_sparse)[0] + song_rank_list, _ = np.asarray(self.ranking(pred)).squeeze() + rank_index = np.argsort(song_rank_list) + rec_list = [] + num = 0 + song_arr = np.asarray(user_pref).squeeze() + #songs that the user has already listened + song_in_bucket = np.nonzero(song_arr)[0] + for item in rank_index: + if num >= num_rec: + break + #exclude the songs that the user has already listened + if item not in song_in_bucket: + rec_list.append(item) + num += 1 + return rec_list + diff --git a/song_index_dictionary.npy 
b/song_index_dictionary.npy new file mode 100755 index 0000000..2e1171e Binary files /dev/null and b/song_index_dictionary.npy differ