Extracting web content with BeautifulSoup and saving it to CSV and Excel

薄腾
2023-12-01

A bare crawler skeleton

1: Settings file (browser User-Agent headers and proxy IPs)

# Browser User-Agent strings
USER_AGENT = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    "Openwave/ UCWEB7.0.2.37/28/999",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
    # iPhone 6:
    "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",

]
# Proxy IPs (note: requests expects lowercase scheme keys, i.e. 'http'/'https')
IPS = [
    {'https': '106.75.226.36:808', 'http': '61.135.217.7:80'},
    {'https': '106.75.164.15:3128', 'http': '118.190.95.35:9001'},
    {'https': '101.204.70.51:808', 'http': '124.235.181.175:80'},
    {'https': '110.73.44.89:8123', 'http': '110.73.6.70:8123'},
    {'https': '182.88.179.108:8123', 'http': '110.73.0.121:8123'},
    {'https': '106.75.164.15:3128', 'http': '61.135.217.7:80'},
    {'https': '106.75.226.36:808', 'http': '222.94.145.158:808'},
    {'https': '121.31.192.106:8123', 'http': '118.190.95.35:9001'},
    {'https': '106.75.164.15:3128', 'http': '124.235.181.175:80'}
]
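
Free proxy lists like the one above go stale quickly, so it can be worth filtering IPS before a crawl. Below is a minimal liveness check, a sketch only: the file name check_proxies.py and the probe URL http://httpbin.org/ip are illustrative choices, and the import assumes the settings above live in setting.py.

# check_proxies.py - minimal liveness filter for the IPS list (illustrative sketch)
import random
import requests
from setting import USER_AGENT, IPS

def alive_proxies(timeout=5):
    good = []
    for proxy in IPS:
        try:
            # httpbin.org/ip simply echoes the caller's IP; any stable URL works as the probe
            requests.get('http://httpbin.org/ip',
                         headers={'User-Agent': random.choice(USER_AGENT)},
                         proxies=proxy,
                         timeout=timeout)
            good.append(proxy)
        except requests.RequestException:
            pass  # drop proxies that time out or refuse the connection
    return good

if __name__ == '__main__':
    print(alive_proxies())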

2: Saving files (two formats: csv and excel)

  • The Saving class below handles both formats: savetoCsv appends rows to a CSV file, and savetoExcel appends rows to an openpyxl worksheet and saves the workbook (a short usage sketch follows the class).
# Imports
import csv
from openpyxl import Workbook

# The saving class
class Saving(object):
    # Save a list of rows to a CSV file
    def savetoCsv(self, datalist, filename):
        # Open in 'a' (append) mode so every call adds rows to the end of the file
        # (use encoding='utf-8-sig' instead if the CSV will be opened in Excel)
        with open(filename, 'a', encoding='utf-8', newline='') as csvfile:
            write = csv.writer(csvfile)
            for each in datalist:
                write.writerow(each)
        print('CSV rows saved')

    # Save a list of rows to an Excel worksheet
    def savetoExcel(self, data, filename, workbook, sheet):
        for each in data:
            sheet.append(each)
        workbook.save(filename)
        print(filename, 'saved')
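
A quick usage sketch for the class above, assuming it is saved as savefile.py (the module imported in section 3); the file names movies.csv / movies.xlsx and the sample rows are placeholders:

# usage sketch for the Saving class (file names and rows are placeholders)
from openpyxl import Workbook
from savefile import Saving

rows = [['rank', 'title'], ['1', 'Example Movie']]
saver = Saving()

# CSV: each call appends the given rows to the file
saver.savetoCsv(rows, 'movies.csv')

# Excel: create the workbook and worksheet once and reuse them across calls
workbook = Workbook()
sheet = workbook.active
saver.savetoExcel(rows, 'movies.xlsx', workbook, sheet)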

3: Extraction and saving code

  • 1: This is the bare skeleton; when using it, only the part that extracts the useful information from the page needs to be filled in.
  • 2: In the main function, pass a file name ending in .csv to save a CSV file, or one ending in .xlsx to save an Excel file (see the usage sketch after the skeleton code below).
# Imports
import requests
from bs4 import BeautifulSoup
import random
from savefile import Saving
from setting import USER_AGENT,IPS
import chardet
import re
import time
import csv
from openpyxl import Workbook

counts=1
# Fetch a page (with a random User-Agent header and a random proxy IP)
def gethtml(url):
    # counts tracks the retries; after 20 failures the proxy IPs are assumed dead and must be refreshed
    global counts
    # Wrap the request in try/except so a failed attempt can be retried a few times
    try:
        response=requests.get(
            url,
            headers={'User-Agent':random.choice(USER_AGENT)},
            proxies=random.choice(IPS)
        )
        # Detect and set the encoding
        response.encoding=chardet.detect(response.content)['encoding']
        return response.text
    except Exception:
        counts+=1
        if counts>20:
            print('The proxy IPs are exhausted, refresh the proxy list')
            return None
        else:
            # Report which retry this is
            print('Retry number', counts)
            return gethtml(url)  # return the retried result (the original dropped it)

# Extract data from a list page
def getData(html,filename,workbook,sheet):
    # data collects the detail-page links found on the current list page
    data=[]
    # Build the BeautifulSoup object
    soup=BeautifulSoup(html,'html.parser')
    # Extract the detail-page links here (Douban movies shown as a hint)
    '''
    # A small example as a hint
    divs = soup.find_all('div', class_='item')
    for i, each in enumerate(divs):
        href = each.find('div', class_='pic').find('a').attrs['href']
        data.append(href)
    '''
    # data holds the detail-page links
    return data
def getdata1(data,filename,workbook,sheet):
    # counts1 tracks which detail page is being processed
    counts1=0
    for each in data:
        counts1+=1
        data1=[]
        # To avoid anti-crawling measures, sleep a few seconds before each detail page
        # (the longer the sleep, the lower the chance of being blocked)
        time.sleep(2)
        # Fetch the page content
        html=gethtml(each)
        if not html:
            continue  # skip this detail page if fetching failed
        # Build the BeautifulSoup object for the detail page
        soup1 = BeautifulSoup(html, 'html.parser')
        # Extract the detail-page information here

        data1.append([])
        # Save each record as soon as it is extracted
        save(data1,filename,workbook,sheet)
        # Report which detail page has just been saved
        print('Detail page', counts1, 'saved')
# The saving helper
def save(data,filename,workbook,sheet):
    # Instantiate the Saving class and call the matching method
    mysave=Saving()
    # Decide between Excel and CSV by the file extension
    if 'xlsx' in filename:
        mysave.savetoExcel(data,filename,workbook,sheet)
    elif 'csv' in filename:
        mysave.savetoCsv(data,filename)
    else:
        pass
# The main function
def main(filename,workbook,sheet):
    # The header row is written only once, so it is saved first
    alldata=[]
    alldata.append(['','','','','','','','','','',''])
    # save() dispatches on the file extension, so one call covers both formats
    save(alldata,filename,workbook,sheet)
    # Crawl several list pages by putting the page number into the URL
    for i in range(1):
        time.sleep(3)
        # Report which list page is being crawled
        print('Crawling list page', i+1)
        # The page number is an int, so str() is needed before string concatenation
        url = ''+str(i)+''  # fill in the real URL template around the page number
        html=gethtml(url)
        data=getData(html,filename,workbook,sheet)
        getdata1(data, filename, workbook, sheet)

# Only run when the script is executed directly, not when it is imported
if __name__=='__main__':
    # When saving to Excel, the workbook and worksheet are created once here
    # and passed down through main() to the Excel-saving function
    workbook = Workbook()
    sheet = workbook.active
    # Give the worksheet a title if desired
    sheet.title=""
    # A first argument ending in .csv saves a CSV file;
    # one ending in .xlsx saves an Excel file.
    main('',workbook,sheet)
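
As mentioned above, the extension of the first argument to main() decides the output format. A short sketch of the two variants, with placeholder file names:

# save as CSV: the .csv extension routes every row through Saving.savetoCsv
main('result.csv', workbook, sheet)

# save as Excel: the .xlsx extension routes every row through Saving.savetoExcel,
# reusing the workbook and sheet created in the __main__ block
main('result.xlsx', workbook, sheet)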

4: Example: crawling the Douban Top 250 with the skeleton (without detail pages), for reference only

import requests
from bs4 import BeautifulSoup
import random
from savefile import Saving
from setting import USER_AGENT,IPS
import chardet
import re
import time
import csv
from openpyxl import Workbook

counts=1
def gethtml(url):
    global counts
    try:
        response=requests.get(
            url,
            headers={'User-Agent':random.choice(USER_AGENT)},
            proxies=random.choice(IPS)
        )
        # Detect and set the encoding
        response.encoding=chardet.detect(response.content)['encoding']
        return response.text
    except Exception:
        counts+=1
        if counts>20:
            print('The proxy IPs are exhausted, refresh the proxy list')
            return None
        else:
            print('Retry number', counts)
            return gethtml(url)  # return the retried result

# Extract the data
def getData(html,filename,workbook,sheet):
    data=[]
    soup=BeautifulSoup(html,'html.parser')
    # Extract the movie information from the Douban list page
    divs=soup.find_all('div',class_='item')
    for i,each in enumerate(divs):
        # Rank
        rank=each.find('div',class_='pic').find('em').get_text()
        # Movie link
        # href=each.find('div',class_='pic').find('a')['href']  # either of these two forms works
        href = each.find('div', class_='pic').find('a').attrs['href']
        # Movie title
        filmname=each.find('div',class_='hd').find_all('span')[0].get_text()
        # Director: first grab the whole info string
        wholetext=each.find('div',class_='bd').find('p').get_text()
        # print(wholetext)
        # The right-hand delimiter is not unique here: it can be a non-breaking space or a slash
        reg=re.compile(r'导演:(.*?)(\xa0|/)')
        director=re.findall(reg,wholetext)[0][0].strip()
        # Leading actors; if missing they would have to be taken from the detail page
        reg1=re.compile(r'主演:(.*?)(/|\.)')
        if re.findall(reg1,wholetext):
            role=re.findall(reg1,wholetext)[0][0].strip()
        else:
            role=''  # the original assigned "relo" here, leaving role undefined
        # Release year
        year_reg=re.compile(r'\d{4}')
        year=re.findall(year_reg,wholetext)[0]
        # Country / region
        # country_reg=re.compile(r'\xa0(.*?)')
        # country=re.findall(country_reg,wholetext)[0]
        country=wholetext.split('\n')[2].split('/')[-2].strip()

        # Genre
        filmtstyle=wholetext.split('\n')[2].strip().split('/')[-1]
        # Score
        score=each.find('div',class_='star').find_all('span')[1].get_text()
        # Number of ratings
        people=each.find('div',class_='star').find_all('span')[3].get_text()
        # Short quote
        quote=each.find('p',class_='quote').get_text()
        data.append([rank,href,filmname,director,role,year,country,filmtstyle,score,people,quote])
    save(data,filename,workbook,sheet)

def save(data,filename,workbook,sheet):
    # Instantiate the Saving class and call the matching method
    mysave=Saving()
    if 'xlsx' in filename:
        mysave.savetoExcel(data,filename,workbook,sheet)
    elif 'csv' in filename:
        mysave.savetoCsv(data,filename)
    else:
        pass
def main(filename,workbook,sheet):
    # Header row, written only once
    alldata=[]
    alldata.append(['Rank','Link','Title','Director','Leads','Year','Country/Region','Genre','Score','Ratings','Quote'])
    # save() dispatches on the file extension, so one call covers both formats
    save(alldata,filename,workbook,sheet)
    for i in range(4):
        time.sleep(3)
        print('Crawling page', i+1)
        url = 'https://movie.douban.com/top250?start='+str(i*25)+'&filter='
        html=gethtml(url)
        getData(html,filename,workbook,sheet)

if __name__=='__main__':
    workbook = Workbook()
    sheet = workbook.active
    sheet.title="豆瓣"
    main('douban.csv',workbook,sheet)
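
To sanity-check the output, a small sketch that reads back douban.csv with the standard library and prints the header plus the first few rows:

# read back the CSV written above and show the first rows
import csv

with open('douban.csv', encoding='utf-8', newline='') as f:
    for i, row in enumerate(csv.reader(f)):
        print(row)
        if i >= 3:  # header row plus the first three movies
            break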