# -*- coding: utf-8 -*-
"""
Created on Mon May 07 11:05:49 2018
B站爬虫:
功能: 1 获取评论人数,点赞数,用户个人信息及评论等等
@author: Alis
"""
import re,time
import requests
import os
import json
# Request headers shared by every API call in this file. The header name must
# be exactly "User-Agent" and the value must be the bare agent string;
# otherwise requests falls back to its default "python-requests/x.y" agent.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}
def getavID(pn):
    """Collect the video ids (``aid``) listed on the first *pn* region pages.

    For each page 1..pn the region listing endpoint is fetched, and for every
    entry under ``data.archives`` the ``aid``, ``title`` and ``attribute``
    fields are printed.

    Args:
        pn: Number of listing pages to fetch, starting from page 1.

    Returns:
        list: The ``aid`` value of every archive entry, in response order.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    url_template = ('https://api.bilibili.com/x/web-interface/dynamic/region'
                    '?&jsonp=jsonp&pn=%d&ps=50&rid=24&_=1525679623909')
    avID = []
    for page in range(1, pn + 1):
        # A timeout keeps one stalled connection from hanging the whole crawl.
        resp = requests.get(url_template % page, headers=headers, timeout=10)
        # Surface HTTP errors here instead of as a confusing JSON parse error.
        resp.raise_for_status()
        data = json.loads(resp.text)
        for ac in data['data']['archives']:
            avID.append(ac['aid'])
            print('aid:  %s' % ac['aid'])
            print('title:  %s' % ac['title'])
            print('attribute:  %s' % ac['attribute'])
        # Pause between listing pages to throttle the request rate.
        time.sleep(2)
    return avID
def _fetch_and_print_replies(oid, page):
    """Fetch one page of replies for video *oid*, print it, return the page count."""
    url = ("https://api.bilibili.com/x/v2/reply?jsonp=jsonp&pn=" + str(page)
           + "&type=1&oid=" + str(oid) + "&sort=0&_=1496477384198")
    # A timeout keeps one stalled connection from hanging the whole crawl.
    resp = requests.get(url, headers=headers, timeout=10)
    print("正在打印第" + str(page) + "页评论!")
    return printTXT(resp.text)


def getHTMLText(i, num=2):
    """Print all comment pages of the video whose id is *i*.

    Pages 1..num-1 are fetched first; the loop stops as soon as one of them
    reports more than one page of comments. With the default ``num=2`` only
    page 1 is probed. Pages 2..page_count are then fetched and printed, with
    a one-second pause after each request.

    Args:
        i: Video id, sent as the ``oid`` query parameter.
        num: Upper bound (exclusive) of the pages probed for the page count.
            If no page is probed (``num <= 1``), *num* itself is used as the
            page count.

    Returns:
        None. Output is produced by ``printTXT``.
    """
    # Separate local so the caller-facing parameter is not overwritten.
    total_pages = num
    for page in range(1, num):
        total_pages = _fetch_and_print_replies(i, page)
        if total_pages > 1:
            break
    # int() keeps range() valid if the page count arrives as a float.
    for page in range(2, int(total_pages) + 1):
        _fetch_and_print_replies(i, page)
        time.sleep(1)
def _print_reply(reply):
    """Print the author's nickname and sex, then the message, of one reply."""
    member = reply['member']
    print(u"用户昵称 : %s 性别 %s" % (member['uname'], member['sex']))
    print(u'评论信息 %s' % reply['content']['message'])


def printTXT(text, page_size=20):
    """Print every comment in one reply-API response and return the page count.

    Each top-level entry of ``data.replies`` is printed, followed by the
    entries of its own nested ``replies`` list.

    Args:
        text: JSON text of one response page. Keys read are
            ``data.replies``, ``data.page.count`` and, per reply,
            ``member.uname``, ``member.sex``, ``content.message``, ``replies``.
        page_size: Number of comments per page used to derive the page count.

    Returns:
        int: Number of comment pages, never less than 1.
    """
    payload = json.loads(text)['data']
    # NOTE(review): assumes the API may send null (or omit the key) instead of
    # an empty list when there are no replies; confirm against live responses.
    for comment in payload.get('replies') or []:
        _print_reply(comment)
        for child in comment.get('replies') or []:
            _print_reply(child)
    total = payload['page']['count']
    # Ceiling division: an exact multiple of page_size must not add an extra
    # page, and // keeps the result an int under true division.
    return max(1, (total + page_size - 1) // page_size)
if __name__ == "__main__":
    # Crawl the first listing page and print the comments of every video on it.
    # (An alternative is to prompt for a single av number and pass it to
    # getHTMLText directly.)
    # time.time() measures wall-clock time on every platform; time.clock()
    # reports CPU time on Unix and no longer exists in Python 3.8+.
    begin = time.time()
    # Explicit loop: map() is lazy on Python 3 and would never run the calls.
    for aid in getavID(1):
        getHTMLText(aid)
    elapsed = time.time() - begin
    print('cost time is:  %s s' % round(elapsed, 3))