推荐系统设计
Posted 每日学习不落下
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了推荐系统设计相关的知识,希望对你有一定的参考价值。
‘’做了不一定有好的结果,不做就一定没有,机器算法的实现,就需要不停进行理论与仿真的尝试‘’
在实现推荐系统时,采用Surprise库进行解决,主要原因在于,Surprise库封装了很多推荐算法:Baseline algorithms、neighborhood methods、matrix factorization-based methods等。
本次仿真以电影推荐为例子,进行引出网易云音乐设计,其实在网易云数据处理中由于电脑原因,并没有跑15G的数据文件,而是直接使用大神的数据分析后的文件,但是对于音乐的处理在文末会进行讲解。
本次将展现程序:
电影推荐
from surprise import KNNWithMeans
from surprise import Dataset, accuracy
from surprise.model_selection import cross_validate, KFold
接下来使用KFold和cross_validate进行交叉验证,其实效果差不多
# Load the built-in MovieLens 100k dataset.
data = Dataset.load_builtin('ml-100k')
# The dataset is fetched by download on first use, so this call may take a while.
# K-fold cross-validation (3 folds): fit on each training split and report
# RMSE on the held-out split. This estimates how well the model generalizes;
# it does not by itself make the model more accurate.
KF = KFold(n_splits=3)
model_KNN = KNNWithMeans()
for trainset, testset in KF.split(data):
    model_KNN.fit(trainset)
    pre = model_KNN.test(testset)
    accuracy.rmse(pre, verbose=True)
Computing the msd similarity matrix... |
# Use cross_validate directly; when cv is not passed it runs 5-fold cross-validation (not 4).
model_KNN_1 = KNNWithMeans()
pref = cross_validate(model_KNN_1, data, verbose=True, measures=['RMSE', 'MAE'])
Computing the msd similarity matrix... |
在此我们进行查看数据形式
196 242 3 881250949 186 302 3 891717742 22 377 1 878887116 244 51 2 880606923 166 346 1 886397596 298 474 4 884182806 115 265 2 881171488 253 465 5 891628467 user_id item_id scores timestamp |
其实对于推荐系统的设计,最重要的就是scores的计算,通过数据,我们计算数据之间的评分,这是非常重要的,直接影响推荐的效果。
#item_based,基于item进行推荐
from __future__ import (absolute_import, division, print_function, unicode_literals)
import os, io
from surprise import KNNBaseline
from surprise import Dataset
def read_item_names(file_name=None):
    """Build lookup tables between MovieLens raw item ids and movie titles.

    We need to go from a movie title to its raw item id, and from a raw item
    id back to the movie's information.

    Args:
        file_name: Path to a ``u.item`` style file with ``|``-separated
            fields (id, title, release date, ...). Defaults to the copy under
            ``~/.surprise_data/ml-100k/ml-100k/u.item``.

    Returns:
        A ``(rid_to_name, name_to_rid)`` tuple where ``rid_to_name`` maps the
        raw item id to ``[title, release_date]`` and ``name_to_rid`` maps the
        title to the raw item id.
    """
    if file_name is None:
        file_name = os.path.join(
            os.path.expanduser('~'),
            '.surprise_data', 'ml-100k', 'ml-100k', 'u.item',
        )
    rid_to_name = {}
    name_to_rid = {}
    # The file is read as ISO-8859-1; the original author notes that decoding
    # it as UTF-8 fails.
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            # Skip blank lines instead of failing with IndexError below.
            if not line.strip():
                continue
            fields = line.split('|')
            rid_to_name[fields[0]] = [fields[1], fields[2]]
            name_to_rid[fields[1]] = fields[0]
    return rid_to_name, name_to_rid
# Fit an item-based KNN model on the full dataset so that item-to-item
# similarities can be queried.
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
model_KNNB = KNNBaseline(sim_options=sim_options)
model_KNNB.fit(trainset)
# Mappings: raw item id -> movie info, and movie title -> raw item id.
rid_to_name, name_to_rid = read_item_names()
# title -> raw id -> inner id
test_raw_id = name_to_rid['Desperado (1995)']
test_inner_id = model_KNNB.trainset.to_inner_iid(test_raw_id)
print(test_raw_id)
print(test_inner_id)
# inner id -> inner ids of the 10 nearest neighbours
test_neighbors = model_KNNB.get_neighbors(test_inner_id, k=10)
print(test_neighbors)
# neighbour inner id -> raw id -> movie info
res = []
print('推荐的信息如下(10)')
for i in test_neighbors:
    row_id = model_KNNB.trainset.to_raw_iid(i)
    print(rid_to_name[row_id])
    res.append(rid_to_name[row_id])
推荐的信息如下(10) |
电影的推荐是系统自带的数据进行仿真的。
音乐推荐
from __future__ import (absolute_import, division, print_function, unicode_literals)
import os
import io
from surprise import KNNBaseline, Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
import pickle
# Load the playlist-id -> playlist-name mapping. The original author reuses a
# pre-processed data file here rather than processing the raw data.
# NOTE(review): pickle.load can execute arbitrary code; only load pickle
# files that come from a trusted source.
with open('./recommendation_system_codes/popular_playlist.pkl', 'rb') as pkl_file:
    id_name_dic = pickle.load(pkl_file, encoding='utf8')
# Reverse mapping: playlist name -> playlist id.
name_id_dic = {}
for playlist_id in id_name_dic:
    name_id_dic[id_name_dic[playlist_id]] = playlist_id
# Load the ratings file (comma-separated: user, item, rating, timestamp).
file_path = os.path.expanduser('./recommendation_system_codes/popular_music_suprise_format.txt')
reader = Reader(line_format='user item rating timestamp', sep=',')
music_data = Dataset.load_from_file(file_path, reader=reader)
# Build the training set from all available ratings.
trainset = music_data.build_full_trainset()
# User-based similarity; the "users" here are looked up by playlist id below.
sim_options = {'user_based': True}
model_KNNB = KNNBaseline(sim_options=sim_options)
# cross_validate(model_KNNB, music_data, verbose=True, measures=['RMSE', 'MAE'])
model_KNNB.fit(trainset)
current_playlist = list(name_id_dic.keys())[56]
print('歌单名:', current_playlist)
歌单名:专属你的周杰伦 |
# Find the ten nearest playlists. The original author warns the result may be
# inaccurate because of how the rating scores were assigned.
# name -> raw id
current_playlist_id = name_id_dic[current_playlist]
# raw id -> inner id
current_playlist_inner_id = model_KNNB.trainset.to_inner_uid(current_playlist_id)
# inner id -> inner ids of the 10 nearest neighbours
neighbors_inner_id = model_KNNB.get_neighbors(current_playlist_inner_id, k=10)
# inner id -> raw id
neighbors_raw_id = [model_KNNB.trainset.to_raw_uid(inner_id) for inner_id in neighbors_inner_id]
# raw id -> playlist name
playlist_name = [id_name_dic[raw_id] for raw_id in neighbors_raw_id]
for name in playlist_name:
    print(name)
当过千评论的华语翻唱遇上“原唱”【更新】 |
针对歌曲进行推荐
import pickle
# Load the song-id -> song-name mapping.
# NOTE(review): pickle.load can execute arbitrary code; only load pickle
# files that come from a trusted source.
with open('./recommendation_system_codes/popular_song.pkl', 'rb') as pkl_file:
    song_id_name_dic = pickle.load(pkl_file, encoding='utf8')
# Reverse mapping: song name -> song id.
song_name_id_dic = {}
for song_id in song_id_name_dic:
    song_name_id_dic[song_id_name_dic[song_id]] = song_id
# Pick a user directly by inner id. The original author notes the resulting
# recommendations are not accurate because of how the scores were assigned.
user_inner_id = 56
user_rating = trainset.ur[user_inner_id]
items = [entry[0] for entry in user_rating]
for song in items:
    # predict() expects RAW user/item ids, so convert the inner ids first;
    # passing inner ids makes Surprise treat the user and item as unknown.
    raw_iid = model_KNNB.trainset.to_raw_iid(song)
    print(model_KNNB.predict(model_KNNB.trainset.to_raw_uid(user_inner_id), raw_iid, r_ui=1), song_id_name_dic[raw_iid])
利用矩阵分解进行预测 |
from surprise import NMF
from surprise import Dataset
# Reload the ratings file and fit an NMF (matrix factorization) model on all of it.
file_path = os.path.expanduser('./recommendation_system_codes/popular_music_suprise_format.txt')
reader = Reader(line_format='user item rating timestamp', sep=',')
music_data = Dataset.load_from_file(file_path, reader=reader)
model_NMF = NMF()
trainset = music_data.build_full_trainset()
model_NMF.fit(trainset)
# playlist name -> raw id -> inner id
user_raw_id = name_id_dic['有没有一首歌让你泪流满面']
user_inner_id = model_NMF.trainset.to_inner_uid(user_raw_id)
user_rating = trainset.ur[user_inner_id]
items = map(lambda x: x[0], user_rating)
for song in items:
    # predict() is called with raw ids, so the inner ids are converted back.
    print(model_NMF.predict(model_NMF.trainset.to_raw_uid(user_inner_id), model_NMF.trainset.to_raw_iid(song), r_ui=1), song_id_name_dic[model_NMF.trainset.to_raw_iid(song)])
user: 89622111 item: 423703717 r_ui = 1.00 est = 1.00 {'was_impossible': False} 我等你到35岁 希小白 |
# Persist the fitted NMF model to disk.
import surprise
model_file = './wangyiRecommedation.model'
surprise.dump.dump(model_file, algo=model_NMF)
# Evaluate NormalPredictor with cross-validation, measuring RMSE and MAE.
from surprise import NormalPredictor
from surprise.model_selection import cross_validate
model_NP = NormalPredictor()
perf = cross_validate(model_NP, music_data, measures=['RMSE', 'MAE'])
{'test_rmse': array([0., 0., 0., 0., 0.]),
'test_mae': array([0., 0., 0., 0., 0.]),
'fit_time': (0.2260129451751709,
0.23301315307617188,
0.22701287269592285,
0.22501301765441895,
0.23001313209533691),
'test_time': (2.9311678409576416,
0.2930169105529785,
0.2950170040130615,
0.29201674461364746,
0.4540259838104248)}
### Evaluate BaselineOnly with cross-validation, measuring RMSE and MAE.
from surprise import BaselineOnly
from surprise.model_selection import cross_validate
model_BL = BaselineOnly()
perf = cross_validate(model_BL, music_data, measures=['RMSE', 'MAE'])
{'test_rmse': array([0., 0., 0., 0., 0.]), |
# Basic collaborative filtering (KNNBasic).
# NOTE(review): this rebinds model_KNNB, which earlier in the file held the
# fitted KNNBaseline model.
from surprise import KNNBasic
from surprise.model_selection import cross_validate
model_KNNB = KNNBasic()
cross_validate(model_KNNB, music_data, measures=['RMSE', 'MAE'])
# Collaborative filtering with means (KNNWithMeans).
from surprise import KNNWithMeans
from surprise.model_selection import cross_validate
model_KN = KNNWithMeans()
cross_validate(model_KN, music_data, measures=['RMSE', 'MAE'])
# Collaborative filtering with baseline (KNNBaseline); the original comment
# mislabelled this as the means variant.
from surprise import KNNBaseline
from surprise.model_selection import cross_validate
model_KB = KNNBaseline()
cross_validate(model_KB, music_data, measures=['RMSE', 'MAE'])
### SVD
from surprise import SVD
from surprise.model_selection import cross_validate
model_SVD = SVD()
cross_validate(model_SVD, music_data, measures=['RMSE', 'MAE'])
### SVD++
from surprise import SVDpp
from surprise.model_selection import cross_validate
model_SVDpp = SVDpp()
cross_validate(model_SVDpp, music_data, measures=['RMSE', 'MAE'])
### NMF; the original comment mislabelled this as SVD++.
from surprise import NMF
from surprise.model_selection import cross_validate
model_NMF = NMF()
cross_validate(model_NMF, music_data, measures=['RMSE', 'MAE'])
以上是关于推荐系统设计的主要内容,如果未能解决你的问题,请参考以下文章