A Bare-Bones Crawler Framework
1: Settings file (browser User-Agent headers and proxy IPs)
# Browser User-Agent strings
USER_AGENT = [
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
"Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
"Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
"Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
"Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
"Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
"Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
"UCWEB7.0.2.37/28/999",
"NOKIA5700/ UCWEB7.0.2.37/28/999",
"Openwave/ UCWEB7.0.2.37/28/999",
"Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
# iPhone 6:
"Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
]
# Proxy IPs (note: requests matches proxy keys in lowercase, so use 'http'/'https';
# free proxies like these expire quickly and need to be refreshed regularly)
IPS = [{'https': '106.75.226.36:808', 'http': '61.135.217.7:80'},
       {'https': '106.75.164.15:3128', 'http': '118.190.95.35:9001'},
       {'https': '101.204.70.51:808', 'http': '124.235.181.175:80'},
       {'https': '110.73.44.89:8123', 'http': '110.73.6.70:8123'},
       {'https': '182.88.179.108:8123', 'http': '110.73.0.121:8123'},
       {'https': '106.75.164.15:3128', 'http': '61.135.217.7:80'},
       {'https': '106.75.226.36:808', 'http': '222.94.145.158:808'},
       {'https': '121.31.192.106:8123', 'http': '118.190.95.35:9001'},
       {'https': '106.75.164.15:3128', 'http': '124.235.181.175:80'}
       ]
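Free proxies like the ones above go stale within hours, so before a long crawl it can be worth filtering the pool. A minimal sketch, assuming the IPS list above; httpbin.org is used purely as an example test endpoint:
# proxy health check -- a sketch, not part of the original framework
import requests

def alive_proxies(proxies_list, timeout=5):
    # keep only the proxy dicts that can fetch a test URL within `timeout` seconds
    good = []
    for proxy in proxies_list:
        try:
            requests.get('http://httpbin.org/ip', proxies=proxy, timeout=timeout)
            good.append(proxy)
        except requests.RequestException:
            pass  # dead or too slow, drop it
    return good

# IPS = alive_proxies(IPS)  # optionally prune the pool at startup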
2: Saving files (two modes: csv and excel)
# imports
from openpyxl import Workbook
import csv
# saving class
class Saving(object):
    # save rows to a csv file
    def savetoCsv(self, datalist, filename):
        # open in 'a' (append) mode so every call adds rows to the file
        with open(filename, 'a', encoding='utf-8', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for each in datalist:
                writer.writerow(each)
        print('csv rows saved')
    # save rows to an excel file
    def savetoExcel(self, data, filename, workbook, sheet):
        for each in data:
            sheet.append(each)
        workbook.save(filename)
        print(filename, 'saved')
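A quick usage sketch of the class (the filenames here are just examples):
# usage sketch -- 'demo.csv' and 'demo.xlsx' are example filenames
saver = Saving()
rows = [['a', 1], ['b', 2]]
saver.savetoCsv(rows, 'demo.csv')
workbook = Workbook()
saver.savetoExcel(rows, 'demo.xlsx', workbook, workbook.active)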
3: Extraction and saving code
- 1: This is the bare framework; to use it, you only need to fill in the part that extracts useful information from the page content.
- 2: When calling the main function, a first argument ending in .csv saves a csv file, while one ending in .xlsx saves an excel file.
# imports
import requests
from bs4 import BeautifulSoup
import random
from savefile import Saving
from setting import USER_AGENT, IPS
import chardet
import re
import time
import csv
from openpyxl import Workbook
counts = 1
# fetch the page content (with a random User-Agent header and proxy IP)
def gethtml(url):
    # counts tracks retries; once it reaches 20 the proxy IPs are
    # considered unusable and need to be refreshed
    global counts
    # on failure, try again a few more times
    try:
        response = requests.get(
            url,
            headers={'User-Agent': random.choice(USER_AGENT)},
            proxies=random.choice(IPS)
        )
        # detect and set the encoding
        response.encoding = chardet.detect(response.content)['encoding']
        return response.text
    except Exception:
        counts += 1
        if counts > 20:
            print('These proxy IPs are no longer usable; fetch new ones')
            return
        else:
            # report which retry this is
            print('Retry number', counts)
            return gethtml(url)  # return the retry's result, otherwise the caller gets None
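Recursion works for a handful of retries, but a loop avoids growing the call stack and keeps the retry count local. A sketch of an equivalent loop-based version, under the same settings; the timeout value is an assumption, not part of the original:
# loop-based alternative to the recursive retry above (a sketch, same behavior)
def gethtml_loop(url, max_tries=20):
    for attempt in range(1, max_tries + 1):
        try:
            response = requests.get(
                url,
                headers={'User-Agent': random.choice(USER_AGENT)},
                proxies=random.choice(IPS),
                timeout=10  # assumption: a timeout keeps dead proxies from hanging forever
            )
            response.encoding = chardet.detect(response.content)['encoding']
            return response.text
        except Exception:
            print('Retry number', attempt)
    print('These proxy IPs are no longer usable; fetch new ones')
    return None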
# extract data
def getData(html, filename, workbook, sheet):
    # data collects the detail-page links found on the current list page
    data = []
    # build the BeautifulSoup object
    soup = BeautifulSoup(html, 'html.parser')
    # extract the detail-page links here, e.g. for Douban movies
    '''
    # a small example:
    divs = soup.find_all('div', class_='item')
    for i, each in enumerate(divs):
        href = each.find('div', class_='pic').find('a').attrs['href']
        data.append(href)
    '''
    # data holds the detail-page links
    return data
def getdata1(data, filename, workbook, sheet):
    # counts1 tracks which detail page we are on
    counts1 = 0
    for each in data:
        counts1 += 1
        data1 = []
        # to avoid anti-scraping measures, sleep a few seconds before each
        # detail page (the longer the sleep, the lower the risk of being blocked)
        time.sleep(2)
        # fetch the page content
        html = gethtml(each)
        if html is None:
            continue  # skip pages that could not be fetched
        # build the BeautifulSoup object for this detail page
        soup1 = BeautifulSoup(html, 'html.parser')
        # extract the page information here
        data1.append([])
        # save each record as soon as it is extracted
        save(data1, filename, workbook, sheet)
        # report which detail page was just saved
        print('Saved detail page number ' + str(counts1))
# dispatcher that picks the save format
def save(data, filename, workbook, sheet):
    # instantiate the Saving class and call the right method
    mysave = Saving()
    # decide between excel and csv by the file extension
    if 'xlsx' in filename:
        mysave.savetoExcel(data, filename, workbook, sheet)
    elif 'csv' in filename:
        mysave.savetoCsv(data, filename)
    else:
        pass
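The substring test above is convenient but loose: a name like 'xlsx_backup.csv' would be routed to excel. A sketch of a stricter dispatcher keyed on the actual extension via os.path.splitext:
# stricter dispatch on the real file extension -- a sketch
import os

def save_strict(data, filename, workbook, sheet):
    ext = os.path.splitext(filename)[1].lower()
    mysave = Saving()
    if ext == '.xlsx':
        mysave.savetoExcel(data, filename, workbook, sheet)
    elif ext == '.csv':
        mysave.savetoCsv(data, filename)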
# the main function
def main(filename, workbook, sheet):
    # the header row is saved only once, so it goes first
    alldata = []
    alldata.append(['', '', '', '', '', '', '', '', '', '', ''])
    save(alldata, filename, workbook, sheet)
    # crawl multiple list pages by substituting the page number into the URL
    for i in range(1):
        time.sleep(3)
        # report which list page we are on
        print('Crawling list page', i + 1)
        # the page number sits in the middle of the URL, so convert it with
        # str() first, because + only concatenates values of the same type
        url = '' + str(i) + ''
        html = gethtml(url)
        if html is None:
            continue  # skip pages that failed to download
        data = getData(html, filename, workbook, sheet)
        getdata1(data, filename, workbook, sheet)
# guard so this only runs when executed as a script
if __name__ == '__main__':
    # when saving to excel, the workbook and worksheet should be created
    # exactly once and passed down through main() into savetoExcel()
    workbook = Workbook()
    sheet = workbook.active
    # the worksheet title is optional
    sheet.title = ""
    # a first argument ending in .csv saves a csv file;
    # one ending in .xlsx saves an excel file
    main('', workbook, sheet)
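Concretely, with hypothetical filenames, either of these calls saves the same rows in a different format:
# main('result.csv', workbook, sheet)   # writes a csv file
# main('result.xlsx', workbook, sheet)  # writes an excel file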
4: Example: crawling Douban Top250 with the framework (no detail pages), sample code for reference
import requests
from bs4 import BeautifulSoup
import random
from savefile import Saving
from setting import USER_AGENT, IPS
import chardet
import re
import time
import csv
from openpyxl import Workbook
counts = 1
def gethtml(url):
    global counts
    try:
        response = requests.get(
            url,
            headers={'User-Agent': random.choice(USER_AGENT)},
            proxies=random.choice(IPS)
        )
        # detect and set the encoding
        response.encoding = chardet.detect(response.content)['encoding']
        return response.text
    except Exception:
        counts += 1
        if counts > 20:
            print('These proxy IPs are no longer usable; fetch new ones')
            return
        else:
            print('Retry number', counts)
            return gethtml(url)
# extract data
def getData(html, filename, workbook, sheet):
    data = []
    soup = BeautifulSoup(html, 'html.parser')
    # extract the Douban movie information
    divs = soup.find_all('div', class_='item')
    for i, each in enumerate(divs):
        # rank
        rank = each.find('div', class_='pic').find('em').get_text()
        # movie link
        # href = each.find('div', class_='pic').find('a')['href']  # this form works too
        href = each.find('div', class_='pic').find('a').attrs['href']
        # movie title
        filmname = each.find('div', class_='hd').find_all('span')[0].get_text()
        # director
        # first grab the whole text block
        wholetext = each.find('div', class_='bd').find('p').get_text()
        # print(wholetext)
        # this regex is a bit clever: earlier we needed unique delimiters on
        # both sides, but here the right boundary is an alternation of delimiters
        reg = re.compile(r'导演:(.*?)(\xa0|/)')
        director = re.findall(reg, wholetext)[0][0].strip()
        # lead actors; if missing, they could be scraped from the detail page
        reg1 = re.compile(r'主演:(.*?)(/|\.)')
        if re.findall(reg1, wholetext):
            role = re.findall(reg1, wholetext)[0][0].strip()
        else:
            role = ''
        # release year
        year_reg = re.compile(r'\d{4}')
        year = re.findall(year_reg, wholetext)[0]
        # country/region
        # country_reg = re.compile(r'\xa0(.*?)')
        # country = re.findall(country_reg, wholetext)[0]
        country = wholetext.split('\n')[2].split('/')[-2].strip()
        # genre
        filmstyle = wholetext.split('\n')[2].strip().split('/')[-1]
        # rating
        score = each.find('div', class_='star').find_all('span')[1].get_text()
        # number of ratings
        people = each.find('div', class_='star').find_all('span')[3].get_text()
        # short quote (a few entries have none, so guard against a missing tag)
        quote_tag = each.find('p', class_='quote')
        quote = quote_tag.get_text() if quote_tag else ''
        data.append([rank, href, filmname, director, role, year, country, filmstyle, score, people, quote])
    save(data, filename, workbook, sheet)
def save(data, filename, workbook, sheet):
    # instantiate the Saving class and call the right method
    mysave = Saving()
    if 'xlsx' in filename:
        mysave.savetoExcel(data, filename, workbook, sheet)
    elif 'csv' in filename:
        mysave.savetoCsv(data, filename)
    else:
        pass
def main(filename, workbook, sheet):
    # the header row is saved only once, so it goes first
    alldata = []
    alldata.append(['Rank', 'Link', 'Title', 'Director', 'Cast', 'Year',
                    'Country/Region', 'Genre', 'Score', 'Votes', 'Quote'])
    save(alldata, filename, workbook, sheet)
    # Top250 lists 25 films per page, so step the start parameter by 25
    for i in range(4):
        time.sleep(3)
        print('Crawling list page', i + 1)
        url = 'https://movie.douban.com/top250?start=' + str(i * 25) + '&filter='
        html = gethtml(url)
        if html:
            getData(html, filename, workbook, sheet)
if __name__ == '__main__':
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = "Douban"
    main('douban.csv', workbook, sheet)