X Tutup
Skip to content

Commit fe83f26

Browse files
committed
更新协同过滤算法
1 parent 16f3fa5 commit fe83f26

File tree

1 file changed

+25
-12
lines changed

1 file changed

+25
-12
lines changed

test/recsys_cf/user_based_cf.py

Lines changed: 25 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -10,9 +10,11 @@ class UserBasedCF:
1010
基于用户的协同过滤算法
1111
"""
1212
def __init__(self, datafile=None):
13-
self.user_sim_best = {} # 优化后的用户相似度集合
14-
self.user_sim_cos = {} # 用户相似度集合
15-
self.user_sim_jaccard = {} # 用户相似度集合
13+
14+
self.user_sim_cos = {} # 用户相似度集合 - 基于余弦相似度
15+
self.user_sim_jaccard = {} # 用户相似度集合 - 基于杰卡德相似系数
16+
self.user_sim_cos_best = {} # 优化后的用户相似度集合 - 基于余弦相似度
17+
self.user_sim_jaccard_best = {} # 优化后的用户相似度集合 - 基于杰卡德相似系数
1618
self.train_data = {} # 用户-物品的评分表
1719
self.test_data = {} # 测试集
1820
self.data = []
@@ -26,7 +28,8 @@ def read_data(self, datafile=None):
2628
"""
2729
self.datafile = datafile or self.datafile
2830
for line in open(self.datafile):
29-
user_id, item_id, record, _ = line.split()
31+
# user_id, item_id, record, _ = line.split()
32+
user_id, item_id, record = line.split()[:3] # 兼容仅3列数据的数据集
3033
self.data.append((user_id, item_id, int(record)))
3134

3235
def split_data(self, k, seed, data=None, m=8):
@@ -73,12 +76,15 @@ def user_similarity_cos(self, train=None):
7376
continue
7477
self.user_sim_cos.setdefault(u, {})
7578
items = set(train[u].keys()) & set(train[v].keys())
76-
sum_of_products = sum([train[u].get(item) * train[v].get(item) for item in items])
77-
sq_u = math.sqrt(sum([pow(score, 2) for score in train[u].values()]))
78-
sq_v = math.sqrt(sum([pow(score, 2) for score in train[v].values()]))
79-
self.user_sim_cos[u][v] = float(sum_of_products) / (sq_u * sq_v)
79+
if not items:
80+
self.user_sim_cos[u][v] = 0.0
81+
else:
82+
sum_of_products = sum([train[u].get(item) * train[v].get(item) for item in items])
83+
sq_u = math.sqrt(sum([pow(score, 2) for score in train[u].values()]))
84+
sq_v = math.sqrt(sum([pow(score, 2) for score in train[v].values()]))
85+
self.user_sim_cos[u][v] = float(sum_of_products) / (sq_u * sq_v)
8086

81-
def user_similarity_best(self, train=None):
87+
def user_similarity_jaccard_best(self, train=None):
8288
"""
8389
用户相似度矩阵 - 基于杰卡德相似系数(优化)
8490
提高稀疏矩阵的运算效率
@@ -104,9 +110,9 @@ def user_similarity_best(self, train=None):
104110
count[u].setdefault(v, 0)
105111
count[u][v] += 1
106112
for u, related_users in count.items():
107-
self.user_sim_best.setdefault(u, dict())
113+
self.user_sim_jaccard_best.setdefault(u, dict())
108114
for v, cuv in related_users.items():
109-
self.user_sim_best[u][v] = cuv / math.sqrt(user_item_count[u] * user_item_count[v] * 1.0)
115+
self.user_sim_jaccard_best[u][v] = cuv / math.sqrt(user_item_count[u] * user_item_count[v] * 1.0)
110116

111117
def recommend(self, user, train=None, k=8, n_item=40):
112118
"""
@@ -188,7 +194,7 @@ def test_user_based_cf():
188194
测试基于用户的协同过滤不同k值下的推荐评测指标
189195
"""
190196
ub_cf = UserBasedCF('u.data')
191-
ub_cf.user_similarity_best()
197+
ub_cf.user_similarity_cos()
192198
print "%3s%20s%20s%20s%20s" % ('K', 'recall', 'precision', 'coverage', 'popularity')
193199
for k in [5, 10, 20, 40, 80, 160]:
194200
recall, precision = ub_cf.recall_and_precision(k=k)
@@ -201,17 +207,24 @@ def test_recommend():
201207
"""
202208
通过测试集合分别测试不同K值的推荐情况
203209
"""
210+
import time
211+
start_time = time.time()
204212
ub_cf = UserBasedCF('u.data')
205213
ub_cf.user_similarity_cos()
214+
train_time = time.time()
215+
print '训练耗时:%sS' % (train_time-start_time)
206216
user = '345'
207217
for k in [5, 10, 20, 40, 80, 160]:
218+
rec_start = time.time()
208219
rank = ub_cf.recommend(user, train=None, k=k, n_item=5)
220+
rec_end = time.time()
209221
print "%s [user=%5s K=%3s] %s" % ('-'*12, user, k, '-'*12)
210222
print "%5s%20s%20s" % ('item', 'similarity', 'record')
211223
for i, rvi in rank.items():
212224
items = ub_cf.test_data.get(user, {})
213225
record = items.get(i, 0)
214226
print "%5s%20.4f%20.4f" % (i, rvi, record)
227+
print '[k=%3s]推荐耗时:%sS' % (k, rec_end-rec_start)
215228

216229

217230
if __name__ == "__main__":

0 commit comments

Comments
 (0)
X Tutup