#! /usr/bin/env python
# coding=utf-8
"""Mine movie association rules from the MovieLens ml-100k ratings with Apriori.

Ratings above 3 are treated as "favorable".  Frequent sets of favorably
rated movies are mined from the ratings of users whose id is in
``range(200)``; every frequent set is turned into rules of the form
"likes all premise movies -> likes the conclusion movie"; each rule's
confidence is measured on those training users and then on all remaining
users, and the five most confident rules are printed.
"""
import sys
from collections import defaultdict
from operator import itemgetter

import pandas as pd

# Directory holding the ml-100k files "u.data" and "u.item".
root = "F:/Data/exe/ml-100k/"

# Minimum number of users that must like every movie of an itemset.
min_support = 50

# Movie title table; filled in by main() and used by get_movie_name().
movie_name_data = None


def load_ratings(data_root=root):
    """Read ``u.data`` and return the ratings with a boolean "Favorable" column.

    The file is tab separated and has no header row.  "DateTime" is parsed
    from a timestamp in seconds, and "Favorable" is True where Rating > 3.
    """
    all_ratings = pd.read_csv(data_root + "u.data", delimiter="\t", header=None)
    all_ratings.columns = ["UserId", "MovieId", "Rating", "DateTime"]
    all_ratings["DateTime"] = pd.to_datetime(all_ratings["DateTime"], unit="s")
    all_ratings["Favorable"] = all_ratings["Rating"] > 3
    return all_ratings


def load_movie_names(data_root=root):
    """Read ``u.item`` ("|" separated, no header) and label its columns."""
    names = pd.read_csv(data_root + "u.item", delimiter="|", header=None,
                        encoding="mac-roman")
    names.columns = ["Movie_Id", "Title", "Release Date",
                     "Video Release", "IMDB", "<UNK>",
                     "Action", "Adventure", "Animation",
                     "Children", "Comedy", "Crim",
                     "Documentary", "Drama", "Fantasy",
                     "Film-Noir", "Horror", "Musical",
                     "Mystery", "Romance", "Sci-Fi",
                     "Thriller", "War", "Western"]
    return names


def favorable_movies_by_user(favorable_ratings):
    """Map each UserId to the frozenset of MovieIds in *favorable_ratings*.

    *favorable_ratings* must already be restricted to favorable rows.
    """
    return {user_id: frozenset(movie_ids.values)
            for user_id, movie_ids
            in favorable_ratings.groupby("UserId")["MovieId"]}


def find_frequent_itemsets(favorable_reviews_by_users, k_1_itermsets, min_support):
    """Extend frequent (k-1)-itemsets by one movie and keep the frequent ones.

    Args:
        favorable_reviews_by_users: dict mapping a user id to the frozenset
            of movie ids that user rated favorably.
        k_1_itermsets: iterable of frozensets (for example the keys of a
            dict) holding the frequent itemsets of the previous length.
        min_support: minimum number of users required for an itemset to be
            returned.

    Returns:
        dict mapping each frequent superset (frozenset) to the number of
        users whose favorable movies contain all of it.
    """
    counts = defaultdict(int)
    for reviews in favorable_reviews_by_users.values():
        # The same superset is reachable from each of its (k-1)-subsets.
        # Collect a user's supersets in a set first so that the user is
        # counted once per superset instead of once per generating subset.
        user_supersets = set()
        for itemset in k_1_itermsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    user_supersets.add(
                        itemset | frozenset((other_reviewed_movie,)))
        for superset in user_supersets:
            counts[superset] += 1
    return {itemset: frequency for itemset, frequency in counts.items()
            if frequency >= min_support}


def grow_frequent_itemsets(favorable_reviews_by_users, frequent_singles,
                           min_support, stop_length=20):
    """Repeatedly extend itemsets until no frequent itemset is found.

    Args:
        favorable_reviews_by_users: dict of user id -> frozenset of movie ids.
        frequent_singles: dict of one-movie frozenset -> support count.
        min_support: minimum support passed to find_frequent_itemsets().
        stop_length: exclusive upper bound on the itemset length tried.

    Returns:
        dict mapping itemset length to the dict of frequent itemsets of that
        length.  The length at which nothing was found maps to an empty dict.
    """
    frequent_itemsets = {1: frequent_singles}
    for k in range(2, stop_length):
        cur_frequent_itemsets = find_frequent_itemsets(
            favorable_reviews_by_users, frequent_itemsets[k - 1], min_support)
        frequent_itemsets[k] = cur_frequent_itemsets
        if not cur_frequent_itemsets:
            print("Did not find any frequent itemsets of lenth{}".format(k))
            sys.stdout.flush()
            break
        print("I find {} frequent itemsets of lenth {}".format(
            len(cur_frequent_itemsets), k))
        sys.stdout.flush()
    return frequent_itemsets


def generate_candidate_rules(frequent_itemsets):
    """Build (premise, conclusion) rules from every frequent itemset.

    Each movie of an itemset becomes the conclusion once, with the remaining
    movies of that itemset (a frozenset) as the premise.
    """
    candidate_rules = []
    for itemset_counts in frequent_itemsets.values():
        for itemset in itemset_counts:
            for conclusion in itemset:
                premise = itemset - frozenset((conclusion,))
                candidate_rules.append((premise, conclusion))
    return candidate_rules


def compute_rule_confidence(candidate_rules, favorable_reviews_by_users):
    """Return the confidence of each rule over the given users.

    A rule applies to a user when the premise is a subset of the user's
    favorable movies; it is correct when the conclusion is among them too.
    Confidence is correct / applicable.  Rules that apply to no user are
    left out of the result, because their confidence is undefined (dividing
    would raise ZeroDivisionError).
    """
    correct_counts = defaultdict(int)
    incorrect_counts = defaultdict(int)
    for reviews in favorable_reviews_by_users.values():
        for candidate_rule in candidate_rules:
            premise, conclusion = candidate_rule
            if premise.issubset(reviews):
                if conclusion in reviews:
                    correct_counts[candidate_rule] += 1
                else:
                    incorrect_counts[candidate_rule] += 1
    confidence = {}
    for candidate_rule in candidate_rules:
        applicable = (correct_counts[candidate_rule]
                      + incorrect_counts[candidate_rule])
        if applicable:
            confidence[candidate_rule] = (
                correct_counts[candidate_rule] / float(applicable))
    return confidence


def get_movie_name(movie_id, movie_names=None):
    """Return the title stored for *movie_id*.

    Args:
        movie_id: value looked up in the "Movie_Id" column.
        movie_names: table with "Movie_Id" and "Title" columns; defaults to
            the module-level ``movie_name_data`` loaded by main().

    Raises:
        RuntimeError: if no table was given and none has been loaded.
        IndexError: if *movie_id* is not present in the table.
    """
    table = movie_name_data if movie_names is None else movie_names
    if table is None:
        raise RuntimeError("movie name table is not loaded")
    title_object = table[table["Movie_Id"] == movie_id]["Title"]
    # The selection is a Series; only its first value (the title) is needed.
    return title_object.values[0]


def describe_rule(premise, conclusion, movie_names=None):
    """Return the printable sentence for one (premise, conclusion) rule."""
    premise_names = ",".join(get_movie_name(idx, movie_names) for idx in premise)
    conclusion_name = get_movie_name(conclusion, movie_names)
    return ("Rule:If a person recommends {0} they will also recommend {1}"
            .format(premise_names, conclusion_name))


def main():
    """Run the whole pipeline and print the five most confident rules."""
    global movie_name_data

    all_ratings = load_ratings(root)

    # Training data: every rating of the users whose id is in range(200)
    # (a selection of users, not the first 200 rows).
    # NOTE(review): if user ids start at 1 this selects 199 users -- confirm.
    is_training_user = all_ratings["UserId"].isin(range(200))
    ratings = all_ratings[is_training_user]
    favorable_ratings = ratings[ratings["Favorable"]]
    favorable_reviews_by_users = favorable_movies_by_user(favorable_ratings)

    # Number of favorable ratings each movie received from training users.
    num_favorable_by_movie = (
        ratings[["MovieId", "Favorable"]].groupby("MovieId").sum())

    # Itemsets are frozensets so they can serve both as sets and as dict keys.
    # The strict ">" for single movies is kept from the original script.
    frequent_singles = {
        frozenset((movie_id,)): row["Favorable"]
        for movie_id, row in num_favorable_by_movie.iterrows()
        if row["Favorable"] > min_support}
    print(frequent_singles)

    frequent_itemsets = grow_frequent_itemsets(
        favorable_reviews_by_users, frequent_singles, min_support)
    # A rule needs a premise and a conclusion, so one-movie sets are dropped.
    del frequent_itemsets[1]

    candidate_rules = generate_candidate_rules(frequent_itemsets)
    print(candidate_rules[:5])

    rule_confidence = compute_rule_confidence(
        candidate_rules, favorable_reviews_by_users)

    movie_name_data = load_movie_names(root)

    sorted_confidence = sorted(rule_confidence.items(),
                               key=itemgetter(1), reverse=True)
    top_rules = [rule for rule, _ in sorted_confidence[:5]]

    for index, (premise, conclusion) in enumerate(top_rules):
        print("Rule #{}".format(index + 1))
        print(describe_rule(premise, conclusion))
        print("-Confidence: {}".format(rule_confidence[(premise, conclusion)]))
        print("")

    # Evaluation data: the favorable movies of every remaining user.
    test_dataset = all_ratings[~is_training_user]
    test_favorable = test_dataset[test_dataset["Favorable"]]
    test_favorable_by_users = favorable_movies_by_user(test_favorable)
    test_confidence = compute_rule_confidence(
        candidate_rules, test_favorable_by_users)

    for index, (premise, conclusion) in enumerate(top_rules):
        print("Rule {}".format(index + 1))
        print(describe_rule(premise, conclusion))
        print("-Train Confidence:{}".format(
            rule_confidence[(premise, conclusion)]))
        # A rule whose premise matches no test user has no test confidence.
        print("-Test Confidence:{}".format(
            test_confidence.get((premise, conclusion), "n/a")))


if __name__ == "__main__":
    main()


# ---------------------------------------------------------------------------
# Console output recorded with the original version of this script (run under
# Python 2, before find_frequent_itemsets() counted each user only once per
# itemset, so the counts below are not what the current code produces).
# The last line is the IDE's "process finished, exit code 0" message.
# ---------------------------------------------------------------------------
# F:\Amy\anaconnda\python.exe F:/Amy/3_python文件/exe/2_movies_apriori.py
# {frozenset([286]): 59.0, frozenset([7]): 67.0, frozenset([64]): 58.0, frozenset([79]): 58.0, frozenset([258]): 83.0, frozenset([50]): 100.0, frozenset([313]): 60.0, frozenset([174]): 74.0, frozenset([100]): 89.0, frozenset([181]): 79.0, frozenset([1]): 66.0, frozenset([127]): 70.0, frozenset([172]): 59.0, frozenset([98]): 70.0, frozenset([56]): 67.0, frozenset([9]): 53.0}
# I find 93 frequent itemsets of lenth 2
# I find 295 frequent itemsets of lenth 3
# I find 593 frequent itemsets of lenth 4
# I find 785 frequent itemsets of lenth 5
# I find 677 frequent itemsets of lenth 6
# I find 373 frequent itemsets of lenth 7
# I find 126 frequent itemsets of lenth 8
# I find 24 frequent itemsets of lenth 9
# I find 2 frequent itemsets of lenth 10
# Did not find any frequent itemsets of lenth11
# [(frozenset([50]), 64), (frozenset([64]), 50), (frozenset([127]), 181), (frozenset([181]), 127), (frozenset([127]), 1)]
# Rule #1
# Rule:If a person recommends Pulp Fiction (1994),Contact (1997),Empire Strikes Back, The (1980),Return of the Jedi (1983),Twelve Monkeys (1995) they will also recommend Star Wars (1977)
# -Confidence: 1.0
#
# Rule #2
# Rule:If a person recommends Silence of the Lambs, The (1991),Godfather, The (1972),Empire Strikes Back, The (1980),Raiders of the Lost Ark (1981),Twelve Monkeys (1995) they will also recommend Shawshank Redemption, The (1994)
# -Confidence: 1.0
#
# Rule #3
# Rule:If a person recommends Pulp Fiction (1994),Toy Story (1995),Shawshank Redemption, The (1994),Godfather, The (1972) they will also recommend Silence of the Lambs, The (1991)
# -Confidence: 1.0
#
# Rule #4
# Rule:If a person recommends Shawshank Redemption, The (1994),Fargo (1996),Return of the Jedi (1983),Raiders of the Lost Ark (1981),Fugitive, The (1993) they will also recommend Pulp Fiction (1994)
# -Confidence: 1.0
#
# Rule #5
# Rule:If a person recommends Pulp Fiction (1994),Fargo (1996),Return of the Jedi (1983),Raiders of the Lost Ark (1981),Godfather, The (1972) they will also recommend Star Wars (1977)
# -Confidence: 1.0
#
# Rule 1
# Rule:If a person recommends Pulp Fiction (1994),Contact (1997),Empire Strikes Back, The (1980),Return of the Jedi (1983),Twelve Monkeys (1995) they will also recommend Star Wars (1977)
# -Train Confidence:1.0
# -Test Confidence:0.965517241379
# Rule 2
# Rule:If a person recommends Silence of the Lambs, The (1991),Godfather, The (1972),Empire Strikes Back, The (1980),Raiders of the Lost Ark (1981),Twelve Monkeys (1995) they will also recommend Shawshank Redemption, The (1994)
# -Train Confidence:1.0
# -Test Confidence:0.853658536585
# Rule 3
# Rule:If a person recommends Pulp Fiction (1994),Toy Story (1995),Shawshank Redemption, The (1994),Godfather, The (1972) they will also recommend Silence of the Lambs, The (1991)
# -Train Confidence:1.0
# -Test Confidence:0.869565217391
# Rule 4
# Rule:If a person recommends Shawshank Redemption, The (1994),Fargo (1996),Return of the Jedi (1983),Raiders of the Lost Ark (1981),Fugitive, The (1993) they will also recommend Pulp Fiction (1994)
# -Train Confidence:1.0
# -Test Confidence:0.755555555556
# Rule 5
# Rule:If a person recommends Pulp Fiction (1994),Fargo (1996),Return of the Jedi (1983),Raiders of the Lost Ark (1981),Godfather, The (1972) they will also recommend Star Wars (1977)
# -Train Confidence:1.0
# -Test Confidence:0.975
# 进程已结束,退出代码0