python_movie_apriori

丌官嘉福
2023-12-01
#! /usr/bin/env python
#coding=utf-8
import pandas as pd
# ---------------------------------------------------------------------------
# Load the ratings file and derive the per-user / per-movie "favorable" views.
# ---------------------------------------------------------------------------
root = "F:/Data/exe/ml-100k/"

# u.data is read as tab-separated with no header row, so columns are named here.
all_ratings = pd.read_csv(root + "u.data", delimiter="\t", header=None)
all_ratings.columns = ["UserId", "MovieId", "Rating", "DateTime"]
# The raw timestamp column is interpreted as seconds since the epoch.
all_ratings["DateTime"] = pd.to_datetime(all_ratings["DateTime"], unit="s")
# A rating strictly above 3 counts as "the user liked this movie".
all_ratings["Favorable"] = all_ratings["Rating"] > 3

# Training subset: every row whose UserId is in 0..199 (all rows of those users,
# not the first 200 rows).
# NOTE(review): if user ids start at 1 this selects 199 users, not 200 - confirm.
ratings = all_ratings[all_ratings["UserId"].isin(range(200))]

# Goal: rules of the form "users who like these movies also like that movie".
favorable_ratings = ratings[ratings["Favorable"]]

# user id -> frozenset of the movie ids that user rated favorably.
favorable_reviews_by_users = {
    user_id: frozenset(movie_ids.values)
    for user_id, movie_ids in favorable_ratings.groupby("UserId")["MovieId"]
}

# Per movie: how many favorable ratings it received (sum of the boolean flag).
# To inspect the five most-liked movies:
#   num_favorable_by_movie.sort_values("Favorable", ascending=False)[:5]
num_favorable_by_movie = ratings[["MovieId", "Favorable"]].groupby("MovieId").sum()

# ------------------------------- Apriori -----------------------------------
min_support = 50

# Itemsets are frozensets so they support set operations and can be dict keys.
# Level 1 holds the single movies whose favorable count exceeds min_support.
# NOTE(review): this uses a strict ">" while find_frequent_itemsets uses ">=".
frequent_itemsets = {
    1: {
        frozenset((movie_id,)): row["Favorable"]
        for movie_id, row in num_favorable_by_movie.iterrows()
        if row["Favorable"] > min_support
    }
}

print(frequent_itemsets[1])
from collections import defaultdict
#函数：定义新发现的频繁集合，创建超集，检测频繁程度

def find_frequent_itemsets(favorable_reviews_by_users,k_1_itermsets,min_support):
    """Grow frequent (k-1)-itemsets into frequent k-itemsets.

    For every user, each (k-1)-itemset fully contained in the user's favorable
    movies is extended by each other movie the user liked. The support of a
    resulting k-itemset is the number of users who produced it.

    Args:
        favorable_reviews_by_users: mapping of user id -> frozenset of movie
            ids the user rated favorably.
        k_1_itermsets: iterable of frozensets (the frequent itemsets of the
            previous length); when a dict is passed, its keys are used.
        min_support: minimum number of users required for an itemset to be
            kept (inclusive).

    Returns:
        dict mapping each frequent k-itemset (frozenset) to its user count.
    """
    counts=defaultdict(int)
    for user,reviews in favorable_reviews_by_users.items():
        # Collect this user's candidates in a set first: the same superset is
        # reachable from several of its (k-1)-subsets, and incrementing the
        # counter on every path would count one user up to k times.
        user_supersets=set()
        for itemset in k_1_itermsets:
            if itemset.issubset(reviews):
                # Extend the itemset with each liked movie not already in it.
                for other_reviewed_movie in reviews-itemset:
                    user_supersets.add(itemset|frozenset((other_reviewed_movie,)))
        for current_superset in user_supersets:
            counts[current_superset]+=1
    # Keep only the itemsets that reach the support threshold.
    return {itemset:frequency for itemset,frequency in counts.items()
            if frequency>=min_support}
import sys
# Level-wise search: build itemsets of length k from those of length k-1,
# for k = 2..19, stopping at the first length that yields nothing.
for k in range(2, 20):
    cur_frequent_itemsets = find_frequent_itemsets(
        favorable_reviews_by_users, frequent_itemsets[k - 1], min_support)
    # The result is stored even when empty, so the last level may be {}.
    frequent_itemsets[k] = cur_frequent_itemsets

    found_any = len(cur_frequent_itemsets) != 0
    if found_any:
        message = "I find {} frequent itemsets of lenth {}".format(len(cur_frequent_itemsets), k)
    else:
        message = "Did not find any frequent itemsets of lenth{}".format(k)
    print(message)
    sys.stdout.flush()

    if not found_any:
        break

# Single-movie itemsets cannot form a premise -> conclusion rule; drop them.
del frequent_itemsets[1]
# Association rules: "if a user likes every movie in the premise, they will
# also like the conclusion". frequent_itemsets maps itemset length -> dict of
# {itemset: count}; each movie of each itemset takes a turn as the conclusion,
# with the remaining movies forming the premise.
candidate_rules = [
    (itemset - set((conclusion,)), conclusion)
    for itemset_counts in frequent_itemsets.values()
    for itemset in itemset_counts
    for conclusion in itemset
]
print(candidate_rules[:5])
# Confidence on the training users. Both counters default missing rules to 0.
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        # The rule only applies to users who like every movie in the premise.
        if not premise.issubset(reviews):
            continue
        # Applicable rule: tally it as correct or incorrect for this user.
        tally = correct_counts if conclusion in reviews else incorrect_counts
        tally[candidate_rule] += 1

# confidence = correct / (correct + incorrect) for every candidate rule.
rule_confidence = {}
for candidate_rule in candidate_rules:
    hits = correct_counts[candidate_rule]
    misses = incorrect_counts[candidate_rule]
    rule_confidence[candidate_rule] = hits / float(hits + misses)

# Load the movie metadata table (pipe-separated, no header row, mac-roman text).
root = "F:/Data/exe/ml-100k/"
movie_name_data = pd.read_csv(
    root + "u.item", delimiter="|", header=None, encoding="mac-roman")
# The first six labels are descriptive fields; the rest are per-genre columns.
movie_name_data.columns = (
    ["Movie_Id", "Title", "Release Date", "Video Release", "IMDB", "<UNK>"]
    + ["Action", "Adventure", "Animation", "Children", "Comedy", "Crim"]
    + ["Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical"]
    + ["Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
)
#用电影编号获得电影
def get_movie_name(movie_id):
    """Return the title of the movie whose ``Movie_Id`` equals *movie_id*.

    Looks the id up in the module-level ``movie_name_data`` frame and returns
    the first matching ``Title`` value. Indexing an empty match raises
    ``IndexError``.
    """
    matches = movie_name_data["Movie_Id"] == movie_id
    # The selection is a Series; only its first value (the title) is wanted.
    return movie_name_data[matches]["Title"].values[0]
#print(get_movie_name(1))
#获取置信度排序前五的规则
from operator import itemgetter
# Rank all rules by training confidence, highest first.
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)
# Iterate over a slice rather than indexing range(5): when fewer than five
# rules exist, this prints what is available instead of raising IndexError.
for index, (rule, confidence) in enumerate(sorted_confidence[:5]):
    premise, conclusion = rule
    print("Rule #{}".format(index + 1))
    premise_names = ",".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule:If a person recommends {0} they will also recommend {1}".format(premise_names, conclusion_name))
    print("-Confidence: {}".format(confidence))
    print("")
# Evaluation: the training set used the users matched by isin(range(200));
# every remaining user forms the test set.
test_dataset = all_ratings[~all_ratings["UserId"].isin(range(200))]
# Favorable ratings of the test users only.
test_favorable = test_dataset[test_dataset["Favorable"]]
# test user id -> frozenset of the movie ids that user rated favorably.
test_favorable_by_users = dict((k, frozenset(v.values))
                               for k, v in test_favorable.groupby("UserId")["MovieId"])

# Count, on the test users, how often each rule holds or fails when it applies.
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in test_favorable_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

# Test confidence per rule. A rule whose premise matches no test user has no
# measurable confidence; record NaN for it instead of dividing by zero, so one
# such rule cannot abort the whole evaluation.
test_confidence = {}
for candidate_rule in candidate_rules:
    applied = correct_counts[candidate_rule] + incorrect_counts[candidate_rule]
    if applied:
        test_confidence[candidate_rule] = correct_counts[candidate_rule] / float(applied)
    else:
        test_confidence[candidate_rule] = float("nan")
# Show training and test confidence side by side for the top-ranked rules.
# Slicing keeps this safe when fewer than five rules exist.
for index, (rule, _train_confidence) in enumerate(sorted_confidence[:5]):
    premise, conclusion = rule
    print("Rule {}".format(index + 1))
    premise_names = ",".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule:If a person recommends {0} they will also recommend {1}"
          .format(premise_names, conclusion_name))
    # .format() must be applied to the string inside print(...): under
    # Python 3, print() returns None, so print("...").format(...) raises
    # AttributeError.
    print("-Train Confidence:{}".format(rule_confidence[(premise, conclusion)]))
    print("-Test Confidence:{}".format(test_confidence[(premise, conclusion)]))
F:\Amy\anaconnda\python.exe F:/Amy/3_python文件/exe/2_movies_apriori.py
{frozenset([286]): 59.0, frozenset([7]): 67.0, frozenset([64]): 58.0, frozenset([79]): 58.0, frozenset([258]): 83.0, frozenset([50]): 100.0, frozenset([313]): 60.0, frozenset([174]): 74.0, frozenset([100]): 89.0, frozenset([181]): 79.0, frozenset([1]): 66.0, frozenset([127]): 70.0, frozenset([172]): 59.0, frozenset([98]): 70.0, frozenset([56]): 67.0, frozenset([9]): 53.0}
I find 93 frequent itemsets of lenth 2
I find 295 frequent itemsets of lenth 3
I find 593 frequent itemsets of lenth 4
I find 785 frequent itemsets of lenth 5
I find 677 frequent itemsets of lenth 6
I find 373 frequent itemsets of lenth 7
I find 126 frequent itemsets of lenth 8
I find 24 frequent itemsets of lenth 9
I find 2 frequent itemsets of lenth 10
Did not find any frequent itemsets of lenth11
[(frozenset([50]), 64), (frozenset([64]), 50), (frozenset([127]), 181), (frozenset([181]), 127), (frozenset([127]), 1)]
Rule #1
Rule:If a person recommends Pulp Fiction (1994),Contact (1997),Empire Strikes Back, The (1980),Return of the Jedi (1983),Twelve Monkeys (1995) they will also recommend Star Wars (1977)
-Confidence: 1.0


Rule #2
Rule:If a person recommends Silence of the Lambs, The (1991),Godfather, The (1972),Empire Strikes Back, The (1980),Raiders of the Lost Ark (1981),Twelve Monkeys (1995) they will also recommend Shawshank Redemption, The (1994)
-Confidence: 1.0


Rule #3
Rule:If a person recommends Pulp Fiction (1994),Toy Story (1995),Shawshank Redemption, The (1994),Godfather, The (1972) they will also recommend Silence of the Lambs, The (1991)
-Confidence: 1.0


Rule #4
Rule:If a person recommends Shawshank Redemption, The (1994),Fargo (1996),Return of the Jedi (1983),Raiders of the Lost Ark (1981),Fugitive, The (1993) they will also recommend Pulp Fiction (1994)
-Confidence: 1.0


Rule #5
Rule:If a person recommends Pulp Fiction (1994),Fargo (1996),Return of the Jedi (1983),Raiders of the Lost Ark (1981),Godfather, The (1972) they will also recommend Star Wars (1977)
-Confidence: 1.0


Rule 1
Rule:If a person recommends Pulp Fiction (1994),Contact (1997),Empire Strikes Back, The (1980),Return of the Jedi (1983),Twelve Monkeys (1995) they will also recommend Star Wars (1977)
-Train Confidence:1.0
-Test Confidence:0.965517241379
Rule 2
Rule:If a person recommends Silence of the Lambs, The (1991),Godfather, The (1972),Empire Strikes Back, The (1980),Raiders of the Lost Ark (1981),Twelve Monkeys (1995) they will also recommend Shawshank Redemption, The (1994)
-Train Confidence:1.0
-Test Confidence:0.853658536585
Rule 3
Rule:If a person recommends Pulp Fiction (1994),Toy Story (1995),Shawshank Redemption, The (1994),Godfather, The (1972) they will also recommend Silence of the Lambs, The (1991)
-Train Confidence:1.0
-Test Confidence:0.869565217391
Rule 4
Rule:If a person recommends Shawshank Redemption, The (1994),Fargo (1996),Return of the Jedi (1983),Raiders of the Lost Ark (1981),Fugitive, The (1993) they will also recommend Pulp Fiction (1994)
-Train Confidence:1.0
-Test Confidence:0.755555555556
Rule 5
Rule:If a person recommends Pulp Fiction (1994),Fargo (1996),Return of the Jedi (1983),Raiders of the Lost Ark (1981),Godfather, The (1972) they will also recommend Star Wars (1977)
-Train Confidence:1.0
-Test Confidence:0.975


进程已结束,退出代码0
python_movie_apriori

相关阅读

相关文章

相关问答