01-12306-抓取验证码

夏长卿
2023-12-01
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import time
from PIL import Image
from io import BytesIO

def getIPPool(ip_url):
    '''
    Scrape a proxy-listing page and collect the proxies it advertises.

    The page at ``ip_url`` is fetched with the module-level ``headers`` and
    parsed with BeautifulSoup (lxml parser). Every ``<tr>`` holding exactly
    eight ``<td>`` cells produces one entry; rows with any other cell count
    (e.g. header rows) are ignored, and so are rows where one of the needed
    cells is empty.

    :param ip_url: URL of the page that lists proxy servers
    :return: list of dicts with the keys ``ip1``, ``port2``, ``location3``,
             ``type5``, ``live_time6`` and ``verify_time7``. The digit in
             each key is the index of the ``<td>`` cell the value was taken
             from; each value is the first child node of that cell.
    '''
    # (key in the result dict, index of the <td> cell it is read from)
    columns = (
        ('ip1', 1),
        ('port2', 2),
        ('location3', 3),
        ('type5', 5),
        ('live_time6', 6),
        ('verify_time7', 7),
    )

    ip_response = requests.get(ip_url, headers=headers).text
    ip_soup = BeautifulSoup(ip_response, 'lxml')
    IPPool = []

    for tr in ip_soup.find_all('tr'):
        # Parse the row's cells once instead of re-querying for every field.
        cells = tr.find_all('td')
        if len(cells) != 8:
            continue
        try:
            ip_info = {key: cells[index].contents[0] for key, index in columns}
        except IndexError:
            # An empty <td> has no children, so the row is incomplete: skip it.
            continue
        IPPool.append(ip_info)
    return IPPool

def saveCaptch(IPPool):
    '''
    Fetch one 12306 captcha image through each proxy and save it to disk.

    For every entry a GET request is sent to the module-level
    ``captcha_12306`` URL with the module-level ``headers`` and a 5 second
    timeout. A response with status 200 is decoded as an image and written
    as a PNG named after the current local time; the function then sleeps
    for one second. Any other status is skipped silently. Network, image
    decoding and file errors are reported with a message and the loop moves
    on to the next proxy.

    :param IPPool: iterable of dicts as returned by ``getIPPool``; ``ip1`` is
                   used as the proxy host and ``port2`` (when present and
                   non-empty) as the proxy port
    :return: None
    '''
    for ip in IPPool:
        host = str(ip['ip1']).strip()
        port = str(ip.get('port2', '')).strip()
        proxy_url = 'http://' + host + (':' + port if port else '')
        # Requests picks the proxy by the scheme of the *target* URL. The
        # captcha URL is https, so an 'http'-only mapping is never applied;
        # register the proxy for both schemes.
        # NOTE(review): assumes the listed proxies accept plain-HTTP
        # connections (CONNECT tunnelling for https targets) - confirm.
        proxies = {
            'http': proxy_url,
            'https': proxy_url,
        }
        print(proxies)
        try:
            r = requests.get(captcha_12306, headers=headers, proxies=proxies, timeout=5)
            if r.status_code != 200:
                continue
            image = Image.open(BytesIO(r.content))
            image.save(r'F:\\12306\\' + time.strftime("%Y%m%d %H%M%S", time.localtime()) + '.png')
            print('successed!')
            # Pause so consecutive files get distinct second-resolution names.
            time.sleep(1)
        except (requests.RequestException, OSError, ValueError):
            # Best effort: a dead proxy, an undecodable body or a failed
            # write must not stop the remaining proxies from being tried.
            print("抓取失败")

if __name__ == '__main__':
    # Script entry point: defines the module-level settings read by
    # getIPPool/saveCaptch, then captures captchas in an endless loop.
    test_url = 'https://www.baidu.com/'  # not referenced by the code below
    ip_url = 'http://www.xicidaili.com/'
    captcha_12306 = 'https://kyfw.12306.cn/passport/captcha/captcha-image'
    # Adjacent literals are joined by the parser. A backslash continuation
    # inside a single literal would embed the next line's indentation into
    # the User-Agent value.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/41.0.2272.118 Safari/537.36'
        ),
    }
    counter = 0
    print(u'初始化成功')
    while True:
        # Refresh the proxy pool, then make five capture passes over it
        # before refreshing again.
        IPPool = getIPPool(ip_url=ip_url)
        print(u'更新IP池成功...')
        for _ in range(5):
            print(u'抓取验证码中.....')
            saveCaptch(IPPool=IPPool)
            # counter is the number of capture passes completed before this one.
            print(counter)
            counter += 1
01-12306-抓取验证码

相关阅读

相关文章

相关问答

相关文档