一、Environment Setup
1、requests: pip3 install requests
2、bs4: pip3 install bs4
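To confirm both packages installed correctly, a quick check in the interpreter helps (a minimal sketch; both libraries expose a __version__ attribute):
import requests
import bs4

# If either import fails, the corresponding pip3 install did not succeed
print('requests:', requests.__version__)
print('bs4:', bs4.__version__)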
二、Crawler Walkthrough
1、Import the packages
import requests
from bs4 import BeautifulSoup
2、Load the web page
response = requests.get('https://ip38.com/ip.php?ip=1.180.1.202')
homepage = response.content.decode()
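Before decoding, it is worth checking that the request actually succeeded. A minimal sketch (the status check is an addition of mine, not part of the original steps):
response = requests.get('https://ip38.com/ip.php?ip=1.180.1.202')
print(response.status_code)   # 200 means the page was returned
print(response.encoding)      # encoding guessed by requests from the response headers
if response.status_code == 200:
    homepage = response.content.decode()   # decode the raw bytes (UTF-8 by default)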
3、Create the BeautifulSoup object
soup = BeautifulSoup(homepage, features='html.parser')  # name a parser explicitly to avoid the "no parser was explicitly specified" warning
4、Parse tags
1) The main ways to locate elements:
# soup.find / soup.find_all
# id='top_progress_prev'
# attrs={'class': 'notice'}
# name='tag name'
# class_='item-1'
For example:
a = soup.find('a')
print(a)
print('a.text (the tag content):', a.text)
print('a.attrs (the tag attributes):', a.attrs)
print('a.string (the tag content as a string):', a.string)
top = soup.find(id='top_progress_prev')
print(top)
# Option 1: pass the attributes as a dict via the attrs parameter
find_class = soup.find(attrs={'class': 'notice'})
li_quick = soup.find_all(attrs={'color': '#FF0000'})
print(li_quick[1].text)
for item in li_quick:
    print('The most flexible lookup:', item.text)
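The name and class_ arguments listed above work the same way; a small sketch (the class name 'item-1' is only an illustrative value and does not necessarily exist on this page):
# find all tags by name, equivalent to soup.find_all('a')
links = soup.find_all(name='a')
# find by CSS class; class is a Python keyword, hence the trailing underscore
items = soup.find_all(class_='item-1')
for tag in items:
    print(tag.text)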
2) Get tag values
# tag.name      the tag name
# tag['class']  the value of one attribute
# tag.attrs     all attributes as a dict
# tag.text      the text content
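For example, applied to the <a> tag found earlier (the exact attributes depend on the page, so treat the output as illustrative):
tag = soup.find('a')
print(tag.name)          # the tag name, e.g. 'a'
print(tag.attrs)         # all attributes as a dict
print(tag.get('class'))  # one attribute; tag['class'] raises KeyError if it is missing
print(tag.text)          # the text content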
三、Project Examples
1) Look up the city for each IP
import requests
import threading
from bs4 import BeautifulSoup

def http_request(url):
    # Fetch the page and return a parsed BeautifulSoup object
    response = requests.get(url)
    homepage = response.content.decode()
    return BeautifulSoup(homepage, features='html.parser')

def write_date(file_name, txt):
    # Append one line of text to the output file
    with open(file_name, "a") as text_file:
        text_file.write(txt)

def ip_address(i):
    # Look up the city for every IP of the form i.j.1.202
    for j in range(255):
        url = 'https://ip38.com/ip.php?ip='
        ip = str(i) + '.' + str(j) + '.1.202'
        url1 = url + ip
        print(ip)
        soup = http_request(url1)
        li_quick = soup.find_all(attrs={'color': '#FF0000'})
        print(li_quick[1].text)
        write_date('ip_address.txt', ip + ':' + li_quick[1].text + '\n')

def main():
    # One thread per first octet
    for i in range(255):
        threading.Thread(target=ip_address, args=(i,)).start()

if __name__ == '__main__':
    main()
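Starting 255 threads at once puts a lot of simultaneous load on one site. A gentler variant is to cap the number of concurrent workers with concurrent.futures; a sketch of that idea reusing the ip_address function above (the pool size of 10 is an arbitrary choice of mine):
from concurrent.futures import ThreadPoolExecutor

def main():
    # at most 10 lookups in flight at any time, instead of 255 threads
    with ThreadPoolExecutor(max_workers=10) as pool:
        pool.map(ip_address, range(255))

if __name__ == '__main__':
    main()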
2) Scrape Fucai 3D lottery data
import requests
from bs4 import BeautifulSoup

def http_request(url):
    # Fetch the page and return a parsed BeautifulSoup object
    response = requests.get(url)
    homepage = response.content.decode()
    return BeautifulSoup(homepage, features='html.parser')

def write_date(file_name, txt):
    # Append one line of text to the output file
    with open(file_name, "a") as text_file:
        text_file.write(txt)

def cp_pass(rows):
    # For each table row: keep the first two cells, then the winning
    # numbers inside the <em> tags of the third cell
    for tr in rows:
        i = 0
        text = ''
        for td in tr.find_all('td'):
            if i == 2:
                em = td.find_all('em')
                for v in em:
                    text = text + v.text + "|"
                break
            text = text + td.text + "|"
            i = i + 1
        if i == 2:
            write_date('cp.txt', text + '\n')

def main():
    for i in range(1000):
        flag = True
        while flag:
            flag = False
            url = 'http://kaijiang.zhcw.com/zhcw/inc/3d/3d_wqhg.jsp?pageNum=' + str(i)
            print(url)
            soup = http_request(url)
            # print(soup)
            try:
                rows = soup.find(attrs={'class': "wqhgt"}).find_all('tr')
            except AttributeError:
                flag = True
                continue  # the result table was not found; retry this page
            cp_pass(rows)

if __name__ == '__main__':
    main()
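Each line written to cp.txt is a '|'-separated record: the first two table cells followed by the individual winning digits. A small sketch for reading the file back for later analysis (the sample line in the comment is only illustrative):
def read_results(file_name='cp.txt'):
    # each line looks roughly like '2022-08-18|2022220|1|2|3|'
    records = []
    with open(file_name) as f:
        for line in f:
            fields = [x for x in line.strip().split('|') if x]
            if fields:
                records.append(fields)
    return records

print(read_results()[:5])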
3) Scrape Weibo follower counts
import requests
from bs4 import BeautifulSoup

def http_request(url, header, cookie):
    # Fetch the page with the given headers and cookies, return a parsed BeautifulSoup object
    response = requests.get(url, headers=header, cookies=cookie)
    homepage = response.content.decode()
    return BeautifulSoup(homepage, features='html.parser')

def write_date(file_name, txt):
    # Append one line of text to the output file
    with open(file_name, "a") as text_file:
        text_file.write(txt)
def main():
    url = 'https://s.weibo.com/weibo?q=dewu'
    print(url)
    cookies = {
        "cookie": 'SINAGLOBAL=4457818576084.978.1657611427780; SUB=_2AkMVgrYDf8NxqwFRmP8Tz23lboVwyQ7EieKj3kfYJRMxHRl-yT9jqmontRB6PgKY7CD-cYFgKfSTtDPE12CIHExvESIo; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9W5yk4-iH2QfJ-qPq_RE7ex0; _s_tentry=weibo.com; Apache=1049933405515.866.1660814785450; ULV=1660814785458:2:1:1:1049933405515.866.1660814785450:1657611427791'
    }
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
        "Referer": "https://weibo.com"
    }
    soup = http_request(url, header, cookies)
    # print(soup)
    pp = soup.find(attrs={'class': 's-nobr'})
    print(pp.text)
if __name__ == '__main__':
    main()
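The text of the matched 's-nobr' element usually mixes the follower count with other words. Assuming that format, a regular expression can pull out just the number; a sketch under that assumption (the exact text on the page may differ):
import re

def extract_count(text):
    # grab the first number, optionally with a 万/亿 suffix, e.g. '粉丝10.5万' -> '10.5万'
    m = re.search(r'\d+(?:\.\d+)?[万亿]?', text)
    return m.group(0) if m else None

# inside main(), after pp = soup.find(attrs={'class': 's-nobr'}):
# print(extract_count(pp.text))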