Preparation
- Python IDLE
- robots exclusion: append /robots.txt to a site's domain to view its crawling rules (see the sketch after this list)
- Install the requests library: open cmd as administrator and run pip install requests
- Install the beautifulsoup4 library: open cmd as administrator and run pip install beautifulsoup4
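As a quick check of the robots convention, here is a minimal sketch that fetches a site's robots.txt with requests (douban.com is just an illustrative domain):

import requests

# fetch and print a site's crawling rules; any domain works, douban.com is only an example
r = requests.get("https://www.douban.com/robots.txt", timeout=10)
print(r.text)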
Examples
- Scrape a single image
import requests
import os
url = "https://i0.hippopx.com/photos/320/918/427/sky-clouds-sunlight-dark-thumb.jpg"
root = "D://pics//"
def getPics(url, root):
    path = root + url.split('/')[-1]  # keep the last '/'-separated segment as the file name
    try:
        # create the root directory if it does not exist
        if not os.path.exists(root):
            os.mkdir(root)
        # download only if the file does not exist yet, otherwise report that it does
        if not os.path.exists(path):
            r = requests.get(url)
            with open(path, 'wb') as f:  # the with block closes the file automatically
                f.write(r.content)
            print("file saved")
        else:
            print("file already exists")
    except:
        print("scrape failed")
getPics(url, root)
- Scrape university rankings
import requests
import bs4
from bs4 import BeautifulSoup
# fetch the page text
def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""
# extract the ranking information
def fillUnivList(ulist, html):
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:  # the fetch failed or the page layout changed
        return
    for tr in tbody.children:  # iterate over the direct children of 'tbody'
        if isinstance(tr, bs4.element.Tag):  # keep only Tag nodes, skipping bare strings
            tds = tr('td')  # tr('td') is shorthand for tr.find_all('td')
            ulist.append([tds[0].string, tds[1].string, tds[3].string])
# print the top num entries
def printUnivList(ulist, num):
    # formatted output
    print("{:^6}\t{:<10}\t{:^6}".format("Rank", "School", "Score"))
    for i in range(num):
        u = ulist[i]
        print("{:^6}\t{:<10}\t{:^6}".format(u[0], u[1], u[2]))
def main():
    uinfo = []
    url = "http://www.zuihaodaxue.com/Greater_China_Ranking2019_0.html"
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    printUnivList(uinfo, 50)
main()
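A small offline illustration of the two bs4 idioms used in fillUnivList, run against a made-up HTML snippet: .children also yields the whitespace text nodes between tags, which the isinstance check filters out, and calling a Tag like tr('td') is shorthand for tr.find_all('td'):

import bs4
from bs4 import BeautifulSoup

html = "<table><tbody> <tr><td>1</td><td>清华大学</td></tr> </tbody></table>"
soup = BeautifulSoup(html, "html.parser")
for tr in soup.find('tbody').children:
    if isinstance(tr, bs4.element.Tag):  # skips the whitespace strings between tags
        print(tr('td'))                  # same result as tr.find_all('td')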
- Scrape the Douban Top 250 movies
import requests
import bs4
import random
from bs4 import BeautifulSoup
tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
user_list = (
    {'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1"},
    {'user-agent': "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1"},
    {'user-agent': "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11"},
    {'user-agent': "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11"},
    {'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}
)
# fetch the page text
def getHTMLText(url, user_agent):
    try:
        r = requests.get(url, headers=user_agent, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""  # empty string on failure so the parser sees no content
# extract the ranking information
def fillUnivList(ulist, html):
    soup = BeautifulSoup(html, "html.parser")
    ol = soup.find('ol')
    if ol is None:  # the fetch failed or the page layout changed
        return
    for li in ol.children:  # iterate over the direct children of 'ol'
        if isinstance(li, bs4.element.Tag):  # keep only Tag nodes
            no = li('em')
            spans = li('span')
            score = li('span', 'rating_num')
            ulist.append([no[0].string, spans[0].string, score[0].string])
# print the top num entries
def printUnivList(ulist, num):
    # formatted output; chr(12288) pads with the full-width space so Chinese text aligns
    for i in range(num):
        u = ulist[i]
        print(tplt.format(u[0], u[1], u[2], chr(12288)))
def main():
    uinfo = []
    # pick a random request header to reduce the chance of being blocked
    user_agent = random.choice(user_list)
    start_url = "https://movie.douban.com/top250?start="
    page = 0
    print(tplt.format("Rank", "Title", "Score", chr(12288)))
    for i in range(10):
        # build the url of each 25-movie page
        url = start_url + str(page) + "&filter="
        html = getHTMLText(url, user_agent)
        fillUnivList(uinfo, html)
        page += 25
    printUnivList(uinfo, page)
main()
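A tiny offline demo of the chr(12288) trick: CJK characters are two terminal cells wide, so padding with the ASCII space (first line) drifts, while padding with the full-width ideographic space (second line) stays aligned:

tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
print(tplt.format("1", "肖申克的救赎", "9.7", ' '))          # ASCII-space padding drifts
print(tplt.format("1", "肖申克的救赎", "9.7", chr(12288)))   # full-width padding lines up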
- Scrape Taobao product listings
import requests
import re
my_headers = {
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.204 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'cache-control': 'max-age=0',
    'authority': 's.taobao.com',
    'cookie': 'cna=Tql9FFpD1BICAXE5sLfHz1uP; tracknick=%5Cu51AC%5Cu6696%5Cu590F%5Cu51C9an; tg=0; enc=oHD1Xeg6OXuTiSJD5wGcdhE7YcKsPUjqsvWpi7CaWnpolSL%2F8ZF0oVJN0ZMhcsUiP06eeZ2YU7N%2BKxLtqeVTbQ%3D%3D; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; miid=1269960616169477290; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; t=90fe486baa6f30adf6f5fa32d2be3a20; _m_h5_tk=4455c3bc90f54ff85d75e7e02d5df6e7_1582003278360; _m_h5_tk_enc=6a767ca75b75e9ebe425a0892b2f9eb8; mt=ci%3D-1_0; cookie2=1fd9b17f075b2651c7141bade62f8b76; _tb_token_=ed33e56d3e8ee; v=0; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; JSESSIONID=2B4CC538D7AEA9C0F49C578476A69C9B; l=cBSwiHYlq4x7AMukBOCN5uI8aO7OSIRYouPRwNVXi_5dv6L_-EbOo5mpdFp6VjWdtZTB4PLEbq99-etkNLe06Pt-g3fP.; isg=BOPj1-iGuYHGzHZiETSah_F2cieN2HcatOqyUBVAP8K5VAN2nagHasGESiTacs8S',
}
# fetch the page; the headers (including a logged-in cookie) keep Taobao from redirecting to its login page
def getHTMLText(url, my_headers):
    try:
        r = requests.get(url, headers=my_headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""  # empty string on failure so the parser sees no content
# parse the page content
def parasPage(ilt, html):
    try:
        plt = re.findall(r'"view_price":"[\d.]*"', html)
        tlt = re.findall(r'"raw_title":".*?"', html)
        for i in range(len(plt)):
            price = eval(plt[i].split(':')[1])  # eval strips the surrounding quotes
            title = eval(tlt[i].split(':')[1])
            ilt.append([price, title])
    except:
        print("parse error")
def printGoodsList(ilt):
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("No.", "Price", "Product"))
    count = 0
    for g in ilt:
        count += 1
        print(tplt.format(count, g[0], g[1]))
def main():
    # search keyword ('书包' means schoolbag)
    goods = '书包'
    # crawl depth in result pages, 44 items per page
    depth = 2
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for i in range(depth):
        try:
            url = start_url + '&s=' + str(44 * i)
            html = getHTMLText(url, my_headers)
            parasPage(infoList, html)
        except:
            continue
    printGoodsList(infoList)
main()
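eval on text pulled from a live page is risky; since the matched fragments are JSON key/value pairs, a safer quote-stripping step parses them with the json module instead. A minimal sketch against a made-up fragment (an alternative, not the approach used above):

import re
import json

sample = '"view_price":"128.00","raw_title":"双肩包 学生书包"'
plt = re.findall(r'"view_price":"[\d.]*"', sample)
# wrap the key:value fragment in braces so it becomes a valid JSON object
price = json.loads('{' + plt[0] + '}')['view_price']
print(price)  # 128.00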
- Scrape stock information
import requests
import re
from bs4 import BeautifulSoup
# basic fetch framework
def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""  # empty string on failure
# extract the stock information from the page
def parasStockPage(html, slist):
    # build the tag tree
    soup = BeautifulSoup(html, 'html.parser')
    # collect all <a> tags
    a = soup.find_all('a')
    for i in a:
        try:
            # look for a six-digit stock code such as (600000); skip the tag otherwise
            isStock = re.search(r'\(\d{6}\)', i.string)
            if isStock:
                # found one: pull out the code and strip it from the name
                code = re.search(r'\d{6}', i.string)
                name = re.sub(r'\(\d{6}\)', '', i.string)
                # store the pair in the list
                slist.append([name, code.group()])
        except:
            continue
def printStockList(slist):
    # formatted string output
    tplt = "{:4}\t{:12}\t{:10}"
    count = 0
    print(tplt.format("No.", "Stock name", "Stock code"))
    for s in slist:
        count += 1
        print(tplt.format(count, s[0], s[1]))
def main():
    stock_list_url = "http://quote.eastmoney.com/stock_list.html"
    slist = []
    html = getHTMLText(stock_list_url)
    parasStockPage(html, slist)
    printStockList(slist)
main()
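The two regex calls in parasStockPage split a link text like 平安银行(000001) into name and code; a self-contained illustration on one sample string:

import re

s = "平安银行(000001)"
code = re.search(r'\d{6}', s).group()  # '000001'
name = re.sub(r'\(\d{6}\)', '', s)     # '平安银行'
print(name, code)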
- Scrape images in batches
import requests
import random
import time
from bs4 import BeautifulSoup
import re
import os
#url_begin = "http://www.win4000.com/meinvtag352_1.html"
#root = "D://pics//"
user_list = (
    {'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1"},
    {'user-agent': "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1"},
    {'user-agent': "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11"},
    {'user-agent': "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11"},
    {'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}
)
# fetch the page text
def getHTMLText(url, user_agent):
    try:
        r = requests.get(url, headers=user_agent, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""  # empty string on failure
# collect the entry URLs of the photo albums on one listing page
def getPicsUrl(htmlText, picsUrls):
    try:
        # re.findall returns a list
        isPicsUrls = re.findall(r'http://www.win4000.com/meinv\d{6}.html', htmlText)
        if isPicsUrls:
            picsUrls.extend(isPicsUrls)
    except:
        print("no album URLs found")
# visit each album in turn and collect the full-size image URLs
def getPicUrl(picsUrls, picUrls):
    for i in range(len(picsUrls)):
        time.sleep(1)
        user_agent = random.choice(user_list)
        htmlText = getHTMLText(picsUrls[i], user_agent)
        soup = BeautifulSoup(htmlText, 'html.parser')
        try:
            ul = soup.find(id='scroll')
            # the matched URLs lack their '.jpg' suffix; savePics appends it
            isPicUrls = re.findall(r'http://pic1.win4000.com/pic/\w/\w{2}/[a-z0-9]+', str(ul))
            if isPicUrls:
                picUrls.extend(isPicUrls)
        except:
            print('getPicUrl error')
# request each image URL in turn and save the image
def savePics(picUrls, root):
    for i in range(len(picUrls)):
        picUrls[i] = picUrls[i] + ".jpg"
        path = root + str(i) + ".jpg"  # name each file by its index
        time.sleep(1)
        try:
            # create the root directory if it does not exist
            if not os.path.exists(root):
                os.mkdir(root)
            # download only if the file does not exist yet, otherwise report that it does
            if not os.path.exists(path):
                r = requests.get(picUrls[i])
                with open(path, 'wb') as f:
                    f.write(r.content)
                print("file saved")
            else:
                print("file already exists")
        except:
            print("file error")
def main():
    root = "D://pics//"
    picsUrls = []  # album entry URLs
    picUrls = []   # image URLs
    # number of listing pages to crawl; the album index has 5 pages in total
    pages = 5
    # collect the album entry URLs; the listing pages are numbered from 1
    for i in range(1, pages + 1):
        print("crawling page " + str(i))
        time.sleep(2)
        # build the url of each listing page
        url = "http://www.win4000.com/meinvtag352_" + str(i) + ".html"
        user_agent = random.choice(user_list)
        # fetch the listing page
        htmlText = getHTMLText(url, user_agent)
        # parse it and append the album URLs to the list
        getPicsUrl(htmlText, picsUrls)
    # enter each album and collect every image URL
    print('running getPicUrl')
    getPicUrl(picsUrls, picUrls)
    # save all collected images
    print('running savePics')
    savePics(picUrls, root)
main()
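As a side note on the save logic in both image examples, os.makedirs with exist_ok=True collapses the exists/mkdir check into a single call and also creates any missing parent directories; a minimal sketch:

import os

root = "D://pics//"
# creates the directory tree if needed, silently does nothing if it already exists
os.makedirs(root, exist_ok=True)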