数据挖掘之协同过滤
Posted by similarface
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了数据挖掘之协同过滤相关的知识,希望对你有一定的参考价值。
# coding:utf-8
"""User-based collaborative filtering with Pearson correlation.

Dataset: http://www2.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip

Column layout of the Book-Crossing CSV files (fields are separated by ';'
and wrapped in double quotes):

    BX-Users["User-ID";"Location";"Age"]
    BX-Books["ISBN";"Book-Title";"Book-Author";"Year-Of-Publication";"Publisher";"Image-URL-S";"Image-URL-M";"Image-URL-L"]
    BX-Book-Ratings["User-ID";"ISBN";"Book-Rating"]
"""
__author__ = 'similarface'

# codecs is used to decode the CSV files
import codecs
import os
import sys
from math import sqrt

# Small in-memory sample: user name -> {artist: rating}
users = {
    "Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0,
                 "Norah Jones": 4.5, "Phoenix": 5.0,
                 "Slightly Stoopid": 1.5, "The Strokes": 2.5,
                 "Vampire Weekend": 2.0},
    "Bill": {"Blues Traveler": 2.0, "Broken Bells": 3.5,
             "Deadmau5": 4.0, "Phoenix": 2.0,
             "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
    "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0,
             "Deadmau5": 1.0, "Norah Jones": 3.0,
             "Phoenix": 5, "Slightly Stoopid": 1.0},
    "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0,
            "Deadmau5": 4.5, "Phoenix": 3.0,
            "Slightly Stoopid": 4.5, "The Strokes": 4.0,
            "Vampire Weekend": 2.0},
    "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0,
               "Norah Jones": 4.0, "The Strokes": 4.0,
               "Vampire Weekend": 1.0},
    "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0,
               "Norah Jones": 5.0, "Phoenix": 5.0,
               "Slightly Stoopid": 4.5, "The Strokes": 4.0,
               "Vampire Weekend": 4.0},
    "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0,
            "Norah Jones": 3.0, "Phoenix": 5.0,
            "Slightly Stoopid": 4.0, "The Strokes": 5.0},
    "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0,
                 "Phoenix": 4.0, "Slightly Stoopid": 2.5,
                 "The Strokes": 3.0},
}


class recommender:
    """Recommend items to a user from the ratings of the most similar users."""

    def __init__(self, data, k=1, metric='pearson', n=5):
        """Create a recommender.

        :param data: dict mapping user -> {item: rating}; any other value
            leaves the rating table empty until loadBookDB() is called
        :param k: number of nearest neighbours used by recommend()
        :param metric: similarity measure; only 'pearson' is implemented
        :param n: maximum number of recommendations returned
        :raises ValueError: if ``metric`` is not supported
        """
        self.k = k
        self.n = n
        self.username2id = {}
        self.userid2name = {}
        self.productid2name = {}
        self.metric = metric
        if self.metric == 'pearson':
            self.fn = self.pearson
        else:
            # Fail here rather than with an AttributeError on self.fn later.
            raise ValueError('unsupported metric: %r' % (metric,))
        # Always define self.data so later lookups do not hit AttributeError.
        self.data = data if isinstance(data, dict) else {}

    @staticmethod
    def _openCsv(path, filename):
        """Open one of the dataset files as UTF-8, dropping undecodable bytes."""
        return codecs.open(os.path.join(path, filename), 'r', 'utf-8',
                           errors='ignore')

    def _loadRatings(self, path):
        """Fill self.data from BX-Book-Ratings.csv; return the lines read."""
        count = 0
        with self._openCsv(path, 'BX-Book-Ratings.csv') as f:
            for line in f:
                count += 1
                fields = line.split(';')
                if len(fields) < 3:
                    # malformed row: not enough columns
                    continue
                user = fields[0].strip('"')
                book = fields[1].strip('"')
                try:
                    rating = int(fields[2].strip().strip('"'))
                except ValueError:
                    # header row or non-numeric rating
                    continue
                self.data.setdefault(user, {})[book] = rating
        return count

    def _loadBooks(self, path):
        """Fill self.productid2name from BX-Books.csv; return the lines read."""
        count = 0
        with self._openCsv(path, 'BX-Books.csv') as f:
            for line in f:
                count += 1
                fields = line.split(';')
                if len(fields) < 3:
                    continue
                isbn = fields[0].strip('"')
                title = fields[1].strip('"')
                author = fields[2].strip('"')
                self.productid2name[isbn] = title + 'by' + author
        return count

    def _loadUsers(self, path):
        """Fill the user lookup tables from BX-Users.csv; return lines read."""
        count = 0
        with self._openCsv(path, 'BX-Users.csv') as f:
            for line in f:
                count += 1
                fields = line.split(';')
                if len(fields) < 2:
                    continue
                userid = fields[0].strip('"')
                location = fields[1].strip('"')
                # Age is the third column, so three fields are enough
                # (the former "> 3" check skipped it on every normal row).
                if len(fields) >= 3:
                    age = fields[2].strip().strip('"')
                else:
                    age = 'NULL'
                if age != 'NULL':
                    value = location + ' (age: ' + age + ')'
                else:
                    value = location
                self.userid2name[userid] = value
                self.username2id[location] = userid
        return count

    def loadBookDB(self, path=''):
        """Load the Book-Crossing dataset found in directory ``path``.

        Replaces self.data with the ratings file contents, fills the book and
        user lookup tables, and prints the total number of lines read from
        the three files.
        """
        self.data = {}
        total = self._loadRatings(path)    # user -> {isbn: rating}
        total += self._loadBooks(path)     # isbn -> title and author
        total += self._loadUsers(path)     # user id <-> location / age
        print(total)

    def pearson(self, rating1, rating2):
        """Return the Pearson correlation of two users' ratings.

        The Pearson product-moment correlation coefficient (PPMCC, usually
        written r) measures the linear correlation between two variables and
        lies between -1 and 1. A common reading of its magnitude:

            0.8-1.0  very strong correlation
            0.6-0.8  strong correlation
            0.4-0.6  moderate correlation
            0.2-0.4  weak correlation
            0.0-0.2  very weak or no correlation

        Only items rated by both users are considered. Returns 0 when there
        is no common item or when either user's common ratings are constant.
        """
        sum_xy, sum_x, sum_y, sum_x2, sum_y2, n = 0, 0, 0, 0, 0, 0
        for key in rating1:
            if key in rating2:
                n += 1
                x = rating1[key]
                y = rating2[key]
                sum_xy += x * y
                sum_x += x
                sum_y += y
                sum_x2 += x ** 2
                sum_y2 += y ** 2
        if n == 0:
            return 0
        # Force true division: integer ratings would truncate on Python 2.
        n = float(n)
        # Clamp at 0 so floating-point round-off cannot make sqrt() raise.
        var_x = max(sum_x2 - (sum_x ** 2) / n, 0.0)
        var_y = max(sum_y2 - (sum_y ** 2) / n, 0.0)
        fenmu = sqrt(var_x) * sqrt(var_y)
        if fenmu == 0:
            return 0
        return (sum_xy - (sum_x * sum_y) / n) / fenmu

    def computeNearesNeighbor(self, username):
        """Return (user, similarity) pairs for all other users, best first."""
        target = self.data[username]
        distances = [(instance, self.fn(target, self.data[instance]))
                     for instance in self.data
                     if instance != username]
        distances.sort(key=lambda pair: pair[1], reverse=True)
        return distances

    def recommend(self, user):
        """Return up to ``self.n`` (item name, score) pairs for ``user``.

        Scores are the similarity-weighted ratings of the ``self.k`` nearest
        neighbours for items that ``user`` has not rated. Returns an empty
        list when there are no neighbours or their similarities sum to zero.
        """
        # Slicing copes with k being larger than the number of other users.
        nearest = self.computeNearesNeighbor(user)[:self.k]
        userRating = self.data[user]
        totalDistance = float(sum(score for _, score in nearest))
        if totalDistance == 0:
            # No usable similarity information; avoids a division by zero.
            return []
        recommendations = {}
        for name, score in nearest:
            weight = score / totalDistance
            neighborRatings = self.data[name]
            # walk through the items rated by this similar user
            for artist in neighborRatings:
                # keep only the items the target user has not rated yet
                if artist not in userRating:
                    recommendations[artist] = (
                        recommendations.get(artist, 0)
                        + neighborRatings[artist] * weight)
        ranked = [(self.convertProductID2name(key), value)
                  for key, value in recommendations.items()]
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked[:self.n]

    def convertProductID2name(self, id):
        """Return the product name for ``id``, or ``id`` itself if unknown."""
        return self.productid2name.get(id, id)

    def userRatings(self, id, n):
        """Print the ``n`` highest ratings given by user ``id``.

        :param id: user id (a key of self.data)
        :param n: number of ratings to print
        """
        # Fall back to the raw id when no user description has been loaded.
        print("Ratings for %s" % self.userid2name.get(id, id))
        ratings = self.data[id]
        print(len(ratings))
        ranked = [(self.convertProductID2name(key), value)
                  for key, value in ratings.items()]
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        for rating in ranked[:n]:
            print("%s\t%i" % (rating[0], rating[1]))


if __name__ == '__main__':
    r = recommender(users)
    print(r.recommend('Veronica'))
    r.loadBookDB(u'D:/360安全浏览器下载/BX-CSV-Dump')
    print(r.recommend('276737'))
#result:
[('Blues Traveler', 5.0)]
1700021
[(u"Devil's Waltz (Alex Delaware Novels (Paperback))byJonathan Kellerman", 9.0), (u'Silent Partner (Alex Delaware Novels (Paperback))byJonathan Kellerman', 8.0), (u'The Outsiders (Now in Speak!)byS. E. Hinton', 8.0), (u'Sein LanguagebyJERRY SEINFELD', 8.0), (u'The Girl Who Loved Tom GordonbyStephen King', 8.0)]
以上是关于数据挖掘之协同过滤的主要内容,如果未能解决你的问题,请参考以下文章