Daily share:
If you're tired, learn to rest, not to give up.
Goal: extract the title and link of every post in a Tieba forum (with automatic pagination built in).
Below is the source code for scraping the data:
import requests
from lxml import etree


class TieBa(object):
    def __init__(self, name):
        self.url = 'https://tieba.baidu.com/f?kw={}&ie=utf-8&pn=0'.format(name)
        self.headers = {
            'User-Agent': 'replace this with your own User-Agent'
        }

    def get_data(self, url):
        response = requests.get(url, headers=self.headers)
        # Uncomment to dump the raw page for debugging:
        # with open('ttt.html', 'wb') as f:
        #     f.write(response.content)
        return response.content

    def parse_data(self, data):
        # Strip the HTML comment markers: much of the page, including the
        # thread list, is served inside <!-- --> comments
        data = data.decode().replace('<!--', '').replace('-->', '')
        # Turn the page into an Element object
        html = etree.HTML(data)
        # Use XPath to select the anchor element of each post
        el_list = html.xpath('//*[@id="thread_list"]/li/div/div[2]/div[1]/div[1]/a')

        data_list = []
        for el in el_list:
            tmp = {}
            # Extract the title from the anchor text
            tmp['title'] = el.xpath('./text()')[0]
            # Extract the link; the href already starts with '/', so join it
            # onto the bare domain to avoid a double slash
            tmp['link'] = 'https://tieba.baidu.com' + el.xpath('./@href')[0]
            data_list.append(tmp)

        # Grab the next page's URL ("下一页>" is the next-page link text)
        try:
            next_url = 'https:' + html.xpath('//a[contains(text(),"下一页>")]/@href')[0]
        except IndexError:
            next_url = None
        return data_list, next_url

    def save_data(self, data_list):
        print(data_list)

    def run(self):
        # Request each page in turn, starting from the first-page URL
        next_url = self.url
        while True:
            data = self.get_data(next_url)
            # Extract the posts and the URL used for turning the page
            data_list, next_url = self.parse_data(data)
            self.save_data(data_list)
            print(next_url)
            # Stop once there is no next page
            if next_url is None:
                break


if __name__ == '__main__':
    tieba = TieBa('林夕')
    tieba.run()
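Why the replace('<!--', '') step in parse_data matters: lxml builds comment nodes, not elements, out of anything inside <!-- -->, so XPath cannot see the commented-out thread list until the markers are stripped. A minimal self-contained demonstration (the snippet markup below is made up for illustration):

from lxml import etree

snippet = '<div id="thread_list"><!--<a href="/p/1">hello</a>--></div>'
# With the comment markers intact, the anchor is only text inside a
# comment node, so XPath finds nothing
print(etree.HTML(snippet).xpath('//a/text()'))   # []
# After stripping the markers, the anchor becomes a normal element
cleaned = snippet.replace('<!--', '').replace('-->', '')
print(etree.HTML(cleaned).xpath('//a/text()'))   # ['hello']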
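save_data above only prints each page's results. If you want to keep them, here is a minimal sketch of a drop-in replacement that appends every post to a JSON Lines file (the filename tieba.jsonl is an assumption for illustration, not part of the original):

import json

def save_data(self, data_list):
    # Append one JSON object per line; ensure_ascii=False keeps the
    # Chinese titles human-readable in the output file (an example
    # name, not from the original code)
    with open('tieba.jsonl', 'a', encoding='utf-8') as f:
        for item in data_list:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')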