爬虫中数据提取-百度贴吧

南宫松
2023-12-01

每日分享:

如果你累了,学会休息,而不是放弃

目标:提取贴吧中每个帖子的标题和链接(内有自动翻页)

以下是爬取数据的源码:

from urllib.parse import urljoin

import requests
from lxml import etree


class TieBa(object):
    """Crawl a Baidu Tieba forum and collect each thread's title and link.

    Starting from the first listing page of the forum called ``name``, the
    crawler downloads a page, extracts the threads on it, hands them to
    :meth:`save_data`, and follows the "next page" link until none is found.
    """

    # Site root against which hrefs found in the page are resolved.
    BASE_URL = 'https://tieba.baidu.com/'
    # Seconds to wait on a request before giving up instead of hanging forever.
    TIMEOUT = 10
    # Placeholder text ("change this to your user-agent"); pass ``user_agent``
    # to the constructor to send a real one.
    DEFAULT_USER_AGENT = '改为你的user-agent'

    def __init__(self, name, user_agent=None):
        """Prepare the first-page URL and request headers.

        Args:
            name: Forum name, substituted into the ``kw`` query parameter.
            user_agent: Optional User-Agent header value. When omitted the
                original placeholder string is used.
        """
        self.url = 'https://tieba.baidu.com/f?kw={}&ie=utf-8&pn=0'.format(name)
        self.headers = {
            'User-Agent': (
                self.DEFAULT_USER_AGENT if user_agent is None else user_agent
            ),
        }

    @classmethod
    def _absolute_url(cls, href):
        """Resolve ``href`` (relative, protocol-relative or absolute) to a full URL."""
        return urljoin(cls.BASE_URL, href)

    def get_data(self, url):
        """Fetch ``url`` with the configured headers and return the raw body bytes."""
        response = requests.get(url, headers=self.headers, timeout=self.TIMEOUT)
        return response.content

    def parse_data(self, data):
        """Extract threads and the next-page URL from one listing page.

        Args:
            data: Raw response body as bytes.

        Returns:
            A ``(data_list, next_url)`` tuple. ``data_list`` is a list of
            ``{'title': ..., 'link': ...}`` dicts; ``next_url`` is the URL of
            the following page, or ``None`` when no "next page" link exists.
        """
        # Much of the listing markup is wrapped in HTML comments; strip the
        # comment delimiters so the parser sees that markup as real elements.
        # Undecodable bytes are replaced rather than aborting the whole crawl.
        text = data.decode(errors='replace').replace('<!--', '').replace('-->', '')
        html = etree.HTML(text)
        if html is None:
            # Guard for a body from which the parser produced no root element.
            return [], None

        data_list = []
        # One <a> element per thread in the listing.
        for el in html.xpath('//*[@id="thread_list"]/li/div/div[2]/div[1]/div[1]/a'):
            titles = el.xpath('./text()')
            hrefs = el.xpath('./@href')
            if not titles or not hrefs:
                # Skip anchors lacking a text node or href instead of raising.
                continue
            data_list.append({
                'title': titles[0],
                # urljoin avoids the doubled slash that plain concatenation
                # produced for root-relative hrefs such as "/p/123".
                'link': self._absolute_url(hrefs[0]),
            })

        # The pager link is matched by its visible text.
        next_hrefs = html.xpath('//a[contains(text(),"下一页>")]/@href')
        next_url = self._absolute_url(next_hrefs[0]) if next_hrefs else None
        return data_list, next_url

    def save_data(self, data_list):
        """Output one page of extracted threads (currently printed to stdout)."""
        print(data_list)

    def run(self):
        """Crawl page after page until no next-page link is found."""
        next_url = self.url
        while next_url is not None:
            data = self.get_data(next_url)
            # Pull out both the threads and the URL used to turn the page.
            data_list, next_url = self.parse_data(data)
            self.save_data(data_list)
            print(next_url)


if __name__ == '__main__':
    # Script entry point: build a crawler for the '林夕' forum and walk
    # through its listing pages.
    crawler = TieBa('林夕')
    crawler.run()
 类似资料: