import json
from urllib import request

import requests
from lxml import etree
# Module-level globals: the request headers and the output file object.
# Sent with every request made by index().
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44'
    ),
}
# Opened (and truncated) as soon as the module is loaded; printt() writes to
# it and the __main__ section closes it.
file = open('./斗鱼.txt', 'w', encoding='utf-8')
11
# Fetch the paginated listing data and hand every page to clean().
def index(first_page=1, last_page=20):
    """Download listing pages and pass each response body to ``clean``.

    For every page number in ``first_page..last_page`` (inclusive) a progress
    message is printed, the page is requested with the module-level
    ``headers``, and the decoded response text is passed to ``clean``.

    Args:
        first_page: First page number to request. Defaults to 1.
        last_page: Last page number to request. Defaults to 20.

    Raises:
        requests.RequestException: If a request fails, times out, or the
            server answers with an HTTP error status.
    """
    for num in range(first_page, last_page + 1):
        # The page number is the final path segment of the endpoint.
        base_url = 'https://www.douyu.com/gapi/rkc/directory/mixList/2_181/{}'.format(num)
        print('正在写入', base_url, '中的数据信息...')
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(base_url, headers=headers, timeout=10)
        # Fail on 4xx/5xx here instead of feeding an error page to the parser.
        response.raise_for_status()
        # Force UTF-8 decoding of the body (the original spelled this 'uft-8').
        response.encoding = 'utf-8'
        clean(response.text)
22
23
# Parse one page of response text and extract the room list.
def clean(jsons):
    """Parse a JSON response body and forward its room list to ``printt``.

    Args:
        jsons: JSON text whose top-level object holds a ``data`` object
            with an ``rl`` entry.

    Raises:
        json.JSONDecodeError: If ``jsons`` is not valid JSON.
        KeyError: If ``data`` or ``rl`` is missing from the parsed mapping.
    """
    # json.loads turns the response text into Python dicts/lists.
    payload = json.loads(jsons)
    # The entries to be written live under data -> rl.
    info_list = payload['data']['rl']
    printt(info_list)
31
# Format every entry and write it to the output stream.
def printt(info_list, out=None):
    """Write one formatted text line per entry of ``info_list``.

    Args:
        info_list: Iterable of mappings providing the keys ``rn``, ``nn``,
            ``ol`` and ``c2name``.
        out: Writable text stream that receives the lines. When omitted,
            the module-level ``file`` object is used.

    Raises:
        KeyError: If an entry lacks one of the four keys.
    """
    target = file if out is None else out
    for entry in info_list:
        room_number = entry['rn']
        homeowner = entry['nn']
        heat = entry['ol']
        c2name = entry['c2name']
        # str.format converts every field to text, so a non-string value no
        # longer breaks the line the way plain '+' concatenation did.
        target.write(
            '{}房间号:{}\t房主:{}\t热度:{}\n'.format(
                c2name, room_number, homeowner, heat
            )
        )
47
if __name__ == '__main__':
    try:
        index()
    finally:
        # Close the output file even if the crawl fails part-way, so that
        # everything written so far is flushed to disk.
        file.close()