Web Scraping with requests

劳法
2023-12-01

一、Environment Setup

1、requests: pip3 install requests   

2、bs4: pip3 install beautifulsoup4 (installing the bs4 name also works; it simply pulls in beautifulsoup4)
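To confirm both packages installed correctly, a quick one-line check (my addition, not part of the original setup):

python3 -c "import requests, bs4; print(requests.__version__, bs4.__version__)"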

二、Scraper Walkthrough

1、Import the packages

import requests
from bs4 import BeautifulSoup

2、Fetch the page

response = requests.get('https://ip38.com/ip.php?ip=1.180.1.202')
homepage = response.content.decode()
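If the request fails or the page uses an unexpected encoding, decode() can raise or produce garbled text. A slightly more defensive variant, where the timeout and status check are my additions rather than part of the original:

response = requests.get('https://ip38.com/ip.php?ip=1.180.1.202', timeout=10)
response.raise_for_status()   # stop early on a 4xx/5xx response
homepage = response.text      # let requests pick the encoding from the response headers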

3、Create a BeautifulSoup object

soup = BeautifulSoup(homepage, 'html.parser')  # pass a parser explicitly to avoid the "no parser specified" warning

4、Parse tags
1) The main ways to look up elements:
# soup.find / soup.find_all can filter by:
#   id='top_progress_prev'
#   attrs={'class': 'notice'}
#   name='<tag name>'
#   class_='item-1'

For example:
a = soup.find('a')
print(a)
print("a.text (the tag's text content):", a.text)
print("a.attrs (the tag's attributes as a dict):", a.attrs)
print("a.string (the tag's content as a string):", a.string)

top = soup.find(id='top_progress_prev')
print(top)
# Option 1: pass the attributes as a dict via attrs
find_class = soup.find(attrs={'class': 'notice'})
li_quick = soup.find_all(attrs={'color': '#FF0000'})
print(li_quick[1].text)   # assumes the page has at least two matching elements
for item in li_quick:     # don't reuse the list's name as the loop variable
    print('the most flexible lookup:', item.text)
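The name and class_ filters from the list above work the same way; the tag and class names below are placeholders, not taken from the target page:

div = soup.find(name='div', class_='item-1')   # class_ has a trailing underscore because class is a Python keyword
if div is not None:
    print(div.text)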
2) Reading values from a tag
# tag.name      gets the tag name
# tag['class']  gets the value of one attribute
# tag.attrs     gets all attributes as a dict
# tag.text      gets the text content
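Continuing with the a tag found earlier, these accessors look like this (the class lookup only works if the tag actually has a class attribute, hence the guard):

print('tag.name:', a.name)        # the tag name, e.g. 'a'
print('tag.attrs:', a.attrs)      # dict of all attributes
print('tag.text:', a.text)        # the text content
if 'class' in a.attrs:
    print("tag['class']:", a['class'])   # list of class names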

三、Example Projects

1) Look up the city behind an IP address

import requests
import threading
from bs4 import BeautifulSoup

def http_request(url):
    response = requests.get(url)
    homepage = response.content.decode()
    return BeautifulSoup(homepage, 'html.parser')

def write_date(file_name, txt):
    # append one line of results to the output file
    with open(file_name, "a") as text_file:
        text_file.write(txt)

def ip_address(i):
    # walk the second octet; the first octet is fixed per thread
    for j in range(255):
        url = 'https://ip38.com/ip.php?ip='
        ip = str(i) + '.' + str(j) + '.1.202'
        url1 = url + ip
        print(ip)
        soup = http_request(url1)
        li_quick = soup.find_all(attrs={'color': '#FF0000'})
        print(li_quick[1].text)
        write_date('ip_address.txt', ip + ':' + li_quick[1].text + '\n')

def main():
    # one thread per first octet
    for i in range(255):
        threading.Thread(target=ip_address, args=(i,)).start()


if __name__ == '__main__':
    main()
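Starting 255 threads at once is hard on both your machine and the target site. As a sketch of one common alternative (not part of the original code), a bounded thread pool caps the concurrency:

from concurrent.futures import ThreadPoolExecutor

def main():
    # at most 16 IP-scanning workers run at the same time
    with ThreadPoolExecutor(max_workers=16) as pool:
        pool.map(ip_address, range(255))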

2) Scrape Fucai 3D lottery results

import requests
from bs4 import BeautifulSoup

def http_request(url):
    response = requests.get(url)
    homepage = response.content.decode()
    return BeautifulSoup(homepage, 'html.parser')

def write_date(file_name, txt):
    with open(file_name, "a") as text_file:
        text_file.write(txt)

def cp_pass(rows):
    for tr in rows:
        i = 0
        text = ''
        for td in tr.find_all('td'):
            if i == 2:
                # the third cell is assumed to hold the drawn digits, one per <em>
                for v in td.find_all('em'):
                    text = text + v.text + "|"
                break
            text = text + td.text + "|"
            i = i + 1
        if i == 2:
            # only write rows that actually reached the digits cell
            write_date('cp.txt', text + '\n')

def main():
    for i in range(1000):
        flag = True
        while flag:
            flag = False
            url = 'http://kaijiang.zhcw.com/zhcw/inc/3d/3d_wqhg.jsp?pageNum=' + str(i)
            print(url)
            soup = http_request(url)
            try:
                rows = soup.find(attrs={'class': "wqhgt"}).find_all('tr')
            except AttributeError:
                # the results table was missing (bad page / throttled); retry this page
                flag = True

        cp_pass(rows)

if __name__ == '__main__':
    main()
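The retry loop above will hammer the lottery site if a page keeps failing. A small, hypothetical helper that pauses before each request (the one-second delay is an arbitrary choice) keeps the scraper polite:

import time

def polite_get(url, delay=1.0):
    # hypothetical wrapper around http_request: sleep first so requests are spaced out
    time.sleep(delay)
    return http_request(url)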

3) Scrape a Weibo follower count

import requests
from bs4 import BeautifulSoup
def http_request(url, header, cookie):
    response = requests.get(url, headers=header, cookies=cookie)
    homepage = response.content.decode()
    return BeautifulSoup(homepage, features='html.parser')

def write_date(file_name, txt):
    with open(file_name, "a") as text_file:
        text_file.write(txt)

def main():
    url = 'https://s.weibo.com/weibo?q=dewu'
    print(url)
    cookies = {
        "cookie": 'SINAGLOBAL=4457818576084.978.1657611427780; SUB=_2AkMVgrYDf8NxqwFRmP8Tz23lboVwyQ7EieKj3kfYJRMxHRl-yT9jqmontRB6PgKY7CD-cYFgKfSTtDPE12CIHExvESIo; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9W5yk4-iH2QfJ-qPq_RE7ex0; _s_tentry=weibo.com; Apache=1049933405515.866.1660814785450; ULV=1660814785458:2:1:1:1049933405515.866.1660814785450:1657611427791'
    }
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
        "Referer": "https://weibo.com"
    }

    soup = http_request(url, header, cookies)
    # the count we want is expected in the first element with class 's-nobr'
    pp = soup.find(attrs={'class': 's-nobr'})
    print(pp.text)

if __name__ == '__main__':
    main()

        
