@@ -10,9 +10,11 @@ class UserBasedCF:
1010 基于用户的协同过滤算法
1111 """
1212 def __init__ (self , datafile = None ):
13- self .user_sim_best = {} # 优化后的用户相似度集合
14- self .user_sim_cos = {} # 用户相似度集合
15- self .user_sim_jaccard = {} # 用户相似度集合
13+
14+ self .user_sim_cos = {} # 用户相似度集合 - 基于余弦相似度
15+ self .user_sim_jaccard = {} # 用户相似度集合 - 基于杰卡德相似系数
16+ self .user_sim_cos_best = {} # 优化后的用户相似度集合 - 基于余弦相似度
17+ self .user_sim_jaccard_best = {} # 优化后的用户相似度集合 - 基于杰卡德相似系数
1618 self .train_data = {} # 用户-物品的评分表
1719 self .test_data = {} # 测试集
1820 self .data = []
@@ -26,7 +28,8 @@ def read_data(self, datafile=None):
2628 """
2729 self .datafile = datafile or self .datafile
2830 for line in open (self .datafile ):
29- user_id , item_id , record , _ = line .split ()
31+ # user_id, item_id, record, _ = line.split()
32+ user_id , item_id , record = line .split ()[:3 ] # 兼容仅3列数据的数据集
3033 self .data .append ((user_id , item_id , int (record )))
3134
3235 def split_data (self , k , seed , data = None , m = 8 ):
@@ -73,12 +76,15 @@ def user_similarity_cos(self, train=None):
7376 continue
7477 self .user_sim_cos .setdefault (u , {})
7578 items = set (train [u ].keys ()) & set (train [v ].keys ())
76- sum_of_products = sum ([train [u ].get (item ) * train [v ].get (item ) for item in items ])
77- sq_u = math .sqrt (sum ([pow (score , 2 ) for score in train [u ].values ()]))
78- sq_v = math .sqrt (sum ([pow (score , 2 ) for score in train [v ].values ()]))
79- self .user_sim_cos [u ][v ] = float (sum_of_products ) / (sq_u * sq_v )
79+ if not items :
80+ self .user_sim_cos [u ][v ] = 0.0
81+ else :
82+ sum_of_products = sum ([train [u ].get (item ) * train [v ].get (item ) for item in items ])
83+ sq_u = math .sqrt (sum ([pow (score , 2 ) for score in train [u ].values ()]))
84+ sq_v = math .sqrt (sum ([pow (score , 2 ) for score in train [v ].values ()]))
85+ self .user_sim_cos [u ][v ] = float (sum_of_products ) / (sq_u * sq_v )
8086
81- def user_similarity_best (self , train = None ):
87+ def user_similarity_jaccard_best (self , train = None ):
8288 """
8389 用户相似度矩阵 - 基于杰卡德相似系数(优化)
8490 提高稀疏矩阵的运算效率
@@ -104,9 +110,9 @@ def user_similarity_best(self, train=None):
104110 count [u ].setdefault (v , 0 )
105111 count [u ][v ] += 1
106112 for u , related_users in count .items ():
107- self .user_sim_best .setdefault (u , dict ())
113+ self .user_sim_jaccard_best .setdefault (u , dict ())
108114 for v , cuv in related_users .items ():
109- self .user_sim_best [u ][v ] = cuv / math .sqrt (user_item_count [u ] * user_item_count [v ] * 1.0 )
115+ self .user_sim_jaccard_best [u ][v ] = cuv / math .sqrt (user_item_count [u ] * user_item_count [v ] * 1.0 )
110116
111117 def recommend (self , user , train = None , k = 8 , n_item = 40 ):
112118 """
@@ -188,7 +194,7 @@ def test_user_based_cf():
188194 测试基于用户的协同过滤不同k值下的推荐评测指标
189195 """
190196 ub_cf = UserBasedCF ('u.data' )
191- ub_cf .user_similarity_best ()
197+ ub_cf .user_similarity_cos ()
192198 print "%3s%20s%20s%20s%20s" % ('K' , 'recall' , 'precision' , 'coverage' , 'popularity' )
193199 for k in [5 , 10 , 20 , 40 , 80 , 160 ]:
194200 recall , precision = ub_cf .recall_and_precision (k = k )
@@ -201,17 +207,24 @@ def test_recommend():
201207 """
202208 通过测试集合分别测试不同K值的推荐情况
203209 """
210+ import time
211+ start_time = time .time ()
204212 ub_cf = UserBasedCF ('u.data' )
205213 ub_cf .user_similarity_cos ()
214+ train_time = time .time ()
215+ print '训练耗时:%sS' % (train_time - start_time )
206216 user = '345'
207217 for k in [5 , 10 , 20 , 40 , 80 , 160 ]:
218+ rec_start = time .time ()
208219 rank = ub_cf .recommend (user , train = None , k = k , n_item = 5 )
220+ rec_end = time .time ()
209221 print "%s [user=%5s K=%3s] %s" % ('-' * 12 , user , k , '-' * 12 )
210222 print "%5s%20s%20s" % ('item' , 'similarity' , 'record' )
211223 for i , rvi in rank .items ():
212224 items = ub_cf .test_data .get (user , {})
213225 record = items .get (i , 0 )
214226 print "%5s%20.4f%20.4f" % (i , rvi , record )
227+ print '[k=%3s]推荐耗时:%sS' % (k , rec_end - rec_start )
215228
216229
217230if __name__ == "__main__" :
0 commit comments