爬虫实战：12306登录

郗唯

2023-12-01

爬虫实战：破解点触验证码，实现12306登录

1.目标

实现12306登录，获取登录cookies

2.技术点

1.借用第三方打码平台，进行图片验证码识别
2.破解selenium webdriver反爬

3.思路

1.输入账号密码
2.获取验证图片
3.识别图片，获取坐标
4.图片验证
5.登录
6.滑动滑块

4.环境

python + selenium + 超级鹰

5.代码

1、12306登录.py

# @author: zly
# @function: Touch verification code
# @time: 2020-09-15
# @copyright: All Rights Reversed

import time
import random

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains

from chaojiying import Chaojiying_Client
from constants import *


class MakeTrack:
    """Generate a list of horizontal offsets that imitates a human drag.

    :param distance: total distance the slider has to travel; defaults to
        the module constant ``DISTANCE``.
    """

    def __init__(self, distance=DISTANCE):

        self.distance = distance

    def segmentate(self, s):
        """Split one displacement into smaller pieces.

        * ``SEGMENTNUM1 <= abs(s) < SEGMENTNUM2``: two pieces,
          ``round(s / 3) - 3`` and ``round(s / 3) + 3``.
        * ``abs(s) >= SEGMENTNUM2``: five pieces spread around
          ``round(s / 5)``.
        * otherwise: the single piece ``round(s)``.

        The pieces are not required to add up to ``s``; ``make_track``
        compensates for the difference once the whole track is built.

        :param s: displacement to split (number)
        :return: list of int offsets
        """

        if SEGMENTNUM1 <= abs(s) < SEGMENTNUM2:
            third = round(s / 3)
            return [third - 3, third + 3]

        if abs(s) >= SEGMENTNUM2:
            fifth = round(s / 5)
            return [fifth - 5, fifth - 3, fifth, fifth + 3, fifth + 5]

        return [round(s)]

    def make_track(self):
        """Build a sliding track whose offsets sum to ``self.distance``.

        A track with more than ``TRACKMAXLENGTH`` steps is thrown away and
        regenerated. The original code recursed here but discarded the
        recursive result, so the over-long track was returned anyway.
        Regeneration is bounded so that an unreachable limit cannot loop
        forever; in that case the last generated track is returned.

        :return: list of offsets, one per drag step
        """

        # Upper bound on regeneration attempts (best effort, see docstring).
        max_attempts = 100

        track = self._build_track()
        for _ in range(max_attempts - 1):
            if len(track) <= TRACKMAXLENGTH:
                break
            track = self._build_track()

        return track

    def _build_track(self):
        """Generate one candidate track, without the length check."""

        track = []
        current = v0 = 0

        while self.distance > current:
            # Random time slice and random acceleration give a random step.
            t = random.randint(1, 4) / 2
            a = random.randint(1, 3)

            # Update the velocity, then derive this step's displacement.
            v0 += a * t
            s = v0 * t + 0.5 * a * t ** 2

            # Break large displacements into several smaller moves.
            track.extend(self.segmentate(round(s)))

            current += s

        # Compensate: drop trailing steps while the track overshoots, then
        # add one final step so the offsets sum exactly to the distance.
        total = sum(track)
        while track and total > self.distance:
            total -= track.pop()
        if total != self.distance:
            track.append(self.distance - total)

        return track


class Login12306(Chaojiying_Client):
    """Log in to 12306 by solving the click captcha and the slider.

    :param
        username   12306 account              --> str
        password   12306 password             --> str
        cusername  Chaojiying account         --> str
        cpassword  Chaojiying password        --> str
        soft_id    Chaojiying software ID     --> str
        codetype   captcha type               --> int
        path       captcha screenshot path    --> str
        configs    optional dict holding the keys above (keyword only)

    The settings are resolved in this order:
        1. a non-empty ``configs`` dict passed as a keyword argument
        2. the module constants, when ``USERNAME`` is set
        3. the explicit constructor arguments

    Creating an instance starts the whole login flow (see ``run``).

    NOTE(review): the ``find_element_by_*`` helpers used in this class are
    reported as removed in Selenium 4.3 -- confirm the pinned Selenium
    version before upgrading.
    """

    def __init__(
            self, username=None, password=None,
            cusername=None, cpassword=None, soft_id=None,
            codetype=None, path=None,
            *args, **kwargs
    ):

        # The old default was the truthy string 'None', which forced the
        # configs branch and raised KeyError when no configs were passed.
        configs = kwargs.get('configs')

        if configs:
            username = configs.get('username', '')
            password = configs.get('password', '')
            cusername = configs.get('cusername', '')
            cpassword = configs.get('cpassword', '')
            soft_id = configs.get('soft_id', '')
            codetype = configs.get('codetype', '')
            path = configs.get('path', '')

        elif USERNAME:
            username, password = USERNAME, PASSWORD
            cusername, cpassword = CUSERNAME, CPASSWORD
            soft_id, codetype, path = SOFTID, CODETIPE, PATH

        # Initialise the Chaojiying client on every path; the constants
        # branch used to skip this, leaving the request parameters unset.
        super().__init__(
            username=cusername,
            password=cpassword,
            soft_id=soft_id
        )

        # These assignments replace the username/password attributes set by
        # the parent class; the parent has already copied its own values
        # into ``base_params``, so the Chaojiying requests are unaffected.
        self.username = username
        self.password = password
        self.cusername = cusername
        self.cpassword = cpassword
        self.soft_id = soft_id
        self.codetype = codetype
        self.path = path

        # Filled in by ``slide`` after a successful login.
        self.cookies = None

        self.run

    @property
    def run(self):
        """Run the complete login flow.

        You can read this property directly for login verification, or
        call the individual methods to achieve the same result.

        :return
            False means login verification failed
            True means login verification succeeded
        """

        self.driver = self.prepares()

        self.driver.get('https://kyfw.12306.cn/otn/resources/login.html')

        self.driver.implicitly_wait(IMPLICITLYWAIT)
        self.driver.maximize_window()

        time.sleep(1)

        # 1. Enter account and password
        self.input_user_pwd(username=self.username, password=self.password)

        # 2. Capture the captcha image
        self.get_pic()

        while True:
            # 3. Recognise the image and get the click coordinates,
            #    retrying once when no coordinates come back
            position, pic_id = self.get_position(codetype=self.codetype)

            if not position:
                position, pic_id = self.get_position(codetype=self.codetype)

            if not position:
                # Recognition failed twice: give up cleanly instead of
                # passing None to img_click, which raised TypeError.
                self.driver.quit()
                return False

            # 4. Click the captcha
            self.img_click(position)

            # 5. Log in
            if not self.login(pic_id):

                # Captcha rejected: reload the page and start over
                self.driver.refresh()
                self.input_user_pwd(username=self.username, password=self.password)
                self.get_pic()

                continue

            # 6. Drag the slider
            return bool(self.slide())

    def prepares(self):
        """Create a Chrome driver with the automation markers hidden.

        :return: the configured webdriver.Chrome instance
        """

        # Per the original author: 12306 still refuses the login after the
        # image check passes because it detects webdriver. The automation
        # switches are therefore removed from the Chrome options and
        # navigator.webdriver is overridden to return undefined before any
        # page script runs.

        options = webdriver.ChromeOptions()
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        driver = webdriver.Chrome(options=options)
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {
                "source": "Object.defineProperty("
                          "navigator, 'webdriver', "
                          "{get: () => undefined})"
            }
        )

        return driver

    def input_user_pwd(self, username=None, password=None):
        """Switch to account login and type the 12306 credentials.

        @username: 12306 account  --> str, default is None
        @password: 12306 password --> str, default is None

        The returned 0 carries no meaning; it only marks the end of the
        function.
        """

        # Switch to the account/password login tab
        self.driver.find_element_by_xpath('//li[@class="login-hd-account"]/a').click()

        # Per the original author: without a pause of 1-2 seconds the next
        # lookups fail because the page script has not finished loading.
        time.sleep(2)

        # Type the account and the password
        self.driver.find_element_by_id('J-userName').send_keys(username)
        self.driver.find_element_by_id('J-password').send_keys(password)

        return 0

    def get_pic(self):
        """Save a screenshot of the captcha image to ``self.path``.

        The returned 0 carries no meaning; it only marks the end of the
        function.
        """

        self.driver.find_element_by_id('J-loginImg').screenshot(self.path)

        return 0

    def get_position(self, codetype=None):
        """Send the captcha to Chaojiying and parse the click coordinates.

        @codetype: captcha type --> int, default is None

        :return
            a list [position, pic_id]; position is a nested list of
            string pairs such as [['55', '55'], ['88', '88']], or None
            when the service reports an error
        """

        # Upload the image and get the recognition result
        verify_data = self.PostPic(self.path, codetype)
        print(verify_data)

        if verify_data['err_no'] != 0:
            # Recognition failed: report the picture and return no position
            self.ReportError(verify_data['pic_id'])
            return [None, verify_data['pic_id']]

        # 'x1,y1|x2,y2' --> [['x1', 'y1'], ['x2', 'y2']]
        position = [pair.split(',') for pair in verify_data['pic_str'].split('|')]

        return [position, verify_data['pic_id']]

    def img_click(self, position):
        """Click the captcha image at every coordinate in ``position``.

        @position: click coordinates --> nested list,
                   [['55', '55'], ['88', '88']...]

        The returned 0 carries no meaning; it only marks the end of the
        function.

        NOTE(review): assumes the offsets are measured from the top-left
        corner of the image; newer Selenium releases are reported to
        measure from the element centre -- confirm for your version.
        """

        # The captcha image to click on
        element = self.driver.find_element_by_id('J-loginImg')

        for point in position:

            # The coordinates arrive as strings; convert them to int
            # before handing them to move_to_element_with_offset.
            x = int(point[0])
            y = int(point[1])

            ActionChains(self.driver).move_to_element_with_offset(element, x, y).click().perform()

        return 0

    def login(self, pic_id=None):
        """Click the login button and check the captcha result.

        @pic_id: Chaojiying picture id, reported back when the captcha
                 is rejected

        :return: True when the captcha was accepted, otherwise False
        """

        self.driver.find_element_by_id('J-login').click()

        # The error tag is visible only when the captcha check failed
        verify_tag = self.driver.find_element_by_xpath('//*[@class="lgcode-error"]')

        if verify_tag.is_displayed():

            # Report the wrong answer to Chaojiying (the original author
            # notes this avoids wasting money on it)
            self.ReportError(pic_id)
            print("图片验证失败，报错成功")

            return False

        print("图片验证成功")
        time.sleep(3)

        return True

    def slide(self):
        """Drag the slider and confirm the login.

        On success the session cookies are printed and kept on
        ``self.cookies``. The browser is closed in every case.

        :return: True when the login succeeded, otherwise False
        """

        try:

            # Locate the slider handle
            element = self.driver.find_element_by_id('nc_1_n1z')
            # Generate the drag track
            track = MakeTrack().make_track()

            # Drag: press, move step by step, release
            ActionChains(self.driver).click_and_hold(element).perform()
            for step in track:
                ActionChains(self.driver).move_by_offset(step, 0).perform()
            ActionChains(self.driver).release(element).perform()

            # How long this takes depends on the network speed
            time.sleep(5)

        except Exception as e:

            # Per the original author: "stale element reference: element is
            # not attached to the page document" can occur here when the
            # page refreshes; it does not happen every time.

            print(str(e))
            time.sleep(10)
            self.driver.quit()

            return False

        # Check whether the login succeeded
        try:

            self.driver.find_element_by_xpath('//*[@class="btn btn-primary ok"]').click()
            cookies = self.driver.get_cookies()

            # Keep the cookies so callers can use them after the browser
            # has been closed.
            self.cookies = cookies

            print("恭喜您登陆成功")
            print(cookies)
            time.sleep(10)

            self.driver.quit()

            return True

        except Exception as e:

            print(str(e))
            print("恭喜您登陆失败，再来一次吧")

            time.sleep(10)
            self.driver.quit()

            return False


# Settings handed to Login12306 through its ``configs`` keyword argument.
configs = dict(
    username='',      # 12306 account
    password='',      # 12306 password
    cusername='',     # Chaojiying account
    cpassword='',     # Chaojiying password
    soft_id='',       # software ID
    codetype=9004,    # captcha type
    path='',          # captcha image path
)


# Creating the instance starts the login flow.
Login12306(configs=configs)

2、chaojiying.py

import requests
from hashlib import md5

class Chaojiying_Client(object):
    """Minimal HTTP client for the Chaojiying captcha-recognition service.

    :param username: Chaojiying account name
    :param password: Chaojiying password in plain text; only its MD5 hex
        digest is stored and sent
    :param soft_id: software ID sent with every request
    :param timeout: seconds to wait for each HTTP request; the requests
        were previously sent without a timeout and could block forever on
        a stalled connection
    """

    def __init__(self, username, password, soft_id, timeout=60):

        self.username = username
        # The service receives the MD5 hex digest as the 'pass2' field.
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout

        # Parameters sent with every request.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }

        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, path, codetype):
        """Upload a captcha image for recognition.

        :param path: path of the image file
        :param codetype: captcha type, see
            http://www.chaojiying.com/price.html
        :return: the decoded JSON response
        """

        with open(path, 'rb') as f:
            imagecontent = f.read()

        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', imagecontent)}

        r = requests.post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            data=params,
            files=files,
            headers=self.headers,
            timeout=self.timeout,
        )

        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised picture.

        :param im_id: picture ID of the captcha being reported
        :return: the decoded JSON response
        """

        params = {
            'id': im_id,
        }
        params.update(self.base_params)

        r = requests.post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            data=params,
            headers=self.headers,
            timeout=self.timeout,
        )

        return r.json()

3、constants.py

# 12306 account
USERNAME = ''

# 12306 password
PASSWORD = ''

# Chaojiying account
CUSERNAME = ''

# Chaojiying password
CPASSWORD = ''

# Software ID
SOFTID = ''

# Captcha type
CODETIPE = ''

# Path of the captcha image
PATH = ''

# Distance the slider is dragged, in px
DISTANCE = 425

# Size thresholds used when splitting a track displacement
SEGMENTNUM1 = 30
SEGMENTNUM2 = 50

# Maximum number of steps in a track
TRACKMAXLENGTH = 30

# Implicit wait passed to the driver's implicitly_wait(), in seconds
IMPLICITLYWAIT = 10

爬虫实战：12306登录

爬虫实战：破解点触验证码，实现12306登录

1.目标

2.技术点

3.思路

4.环境

5.代码

温馨提示：千万不要干坏事喲~~，否则抓进局里后果自负…

相关阅读

相关文章

相关问答

相关文档