当前位置: 首页 > 工具软件 > httpkit > 使用案例 >

自定义HTTP工具包:httpkit.py

百里飞捷
2023-12-01

导入内置模块

urllib.request

urllib.parse

http.cookiejar

ssl

socket

gzip

random

uas.py模块

"""
@created: 2022/3/25 Fri.
@by: MR.N
@blog: https://blog.csdn.net/qq_21264377
@version: 1.0
"""

# Pool of browser User-Agent strings. httpkit's unspecific_ua() picks one entry
# at random for every request.
# Adjacent string literals without a comma between them are concatenated by
# Python, so each two-line pair below forms a single list entry.
user_agent_list = [
    'Mozilla/5.0 (Linux; Android 7.0; SM-G892A Build/NRD90M; wv) '
    'AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/67.0.3396.87 Mobile Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1',
    'Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) '
    'Chrome/20.0.1132.57 Safari/536.11',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5',
    'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) '
    'Chrome/19.0.1063.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) '
    'Chrome/19.0.1055.1 Safari/535.24'
]

httpkit.py模块

常量

SOCKET_TIMEOUT

# Process-wide default socket timeout, in seconds. The fetch functions pass it
# to socket.setdefaulttimeout() before every request.
SOCKET_TIMEOUT = 30

HTTPS_TIMEOUT

# Per-request timeout, in seconds. The fetch functions pass it as ``timeout=``
# to opener.open().
HTTPS_TIMEOUT = 7

函数

valid_https

def valid_https(url=None):
    """Return True when *url* looks like an absolute http(s) URL with a dotted host.

    The check is purely textual: the URL is split on '/', the first piece
    must be ``'http:'`` or ``'https:'`` (case-sensitive), the second must be
    empty (the gap between the two slashes of ``//``) and the third -- the
    host -- must contain at least one '.'.

    Args:
        url: Candidate URL. ``None`` and any non-string value are rejected.

    Returns:
        bool: ``True`` if the URL passes the checks above, otherwise ``False``.
    """
    # Anything that is not a string (None, bytes, numbers, ...) cannot be a URL
    # here; reject it instead of letting the string operations below raise.
    if not isinstance(url, str):
        return False
    parts = url.split('/')
    # "scheme://host" yields at least [scheme + ':', '', host]. Fewer pieces
    # means there is no host segment to inspect (e.g. 'http:' or 'http:/').
    if len(parts) < 3:
        return False
    if parts[0] not in ('http:', 'https:'):
        return False
    if parts[1] != '':
        return False
    return '.' in parts[2]

split_path

def split_path(url, separator='/'):
    """Split *url* on every occurrence of *separator* and return the pieces.

    Args:
        url: String to split.
        separator: Delimiter to split on; '/' by default.

    Returns:
        list[str]: The pieces in their original order.
    """
    pieces = url.split(separator)
    return pieces

get_host

def get_host(paths):
    """Join the first three segments of a '/'-split URL back together.

    For a URL split as ``['http:', '', 'example.com', ...]`` this gives
    ``'http://example.com'`` (no trailing slash).

    Args:
        paths: Sequence of URL segments, as produced by splitting on '/'.

    Returns:
        str: The first (up to) three segments joined with '/'.
    """
    leading = paths[:3]
    return '/'.join(leading)

get_current_dir

def get_current_dir(paths):
    """Join every segment of a '/'-split URL except the last one.

    For ``['http:', '', 'example.com', 'a', 'b.html']`` this gives
    ``'http://example.com/a'`` (no trailing slash).

    Args:
        paths: Sequence of URL segments, as produced by splitting on '/'.

    Returns:
        str: All segments but the final one, joined with '/'.
    """
    parent_segments = paths[:-1]
    return '/'.join(parent_segments)

pad

说明

URL地址分为完整地址和缩写地址。完整地址包含访问协议、主机地址和路径,而缩写地址只包含其中一部分(通常只有路径)。缩写地址是相对于当前请求网页地址的网络资源描述,因此需要根据当前网页所在的主机地址和网站目录填充为完整的地址。
常见缩写地址,分为三种格式://host/path、/path和path。第一种缺失访问协议,在这里设为 http 或 https,与当前网页URL的访问协议相同;第二种则相对于主机地址的网站根目录而言;第三种是相对于当前网页地址所在目录的路径。

代码

# -*- coding: utf-8 -*-
"""
@file: httpkit
@author: MR.N
@created: 2022/4/2 4月
@version: 1.0
@blog: https://blog.csdn.net/qq_21264377
"""


def pad(url, links):
    """Expand every entry of *links* into an absolute URL relative to page *url*.

    Four link shapes are handled:

    * ``http://``, ``https://`` or ``ftp://`` prefix -- kept unchanged;
    * ``//host/path`` -- prefixed with the scheme of *url*;
    * ``/path`` -- resolved against the site root of *url*;
    * anything else -- resolved against the directory of *url*.

    No normalisation is performed: ``./`` and ``../`` segments are kept as
    written.

    Args:
        url: Absolute http(s) URL of the page the links were found on.
        links: Iterable of link strings.

    Returns:
        list[str]: One absolute URL per input link, in input order. An empty
        list when *url* does not pass ``valid_https``.
    """
    if not valid_https(url):
        return []
    # Query string and fragment are not part of the directory hierarchy. Drop
    # them so a '/' inside them cannot be mistaken for a path separator.
    base = url.split('#', 1)[0].split('?', 1)[0]
    paths = split_path(base)
    host = get_host(paths) + '/'
    if len(paths) > 3:
        current_dir = get_current_dir(paths) + '/'
    else:
        # "http://host" has no path at all; its directory is the site root.
        # Without this, get_current_dir() would return 'http:/' and relative
        # links would resolve to 'http://<link>'.
        current_dir = host
    res = []
    for link in links:
        if link.startswith(('http://', 'https://', 'ftp://')):
            pass
        elif link.startswith('//'):
            # Protocol-relative link: reuse the page's own scheme ('http:' / 'https:').
            link = paths[0] + link
        elif link.startswith('/'):
            # host already ends with '/', so drop the link's leading slash.
            link = host + link[1:]
        else:
            link = current_dir + link
        res.append(link)
    return res

mask_host

说明

mask_host()函数是为了在命令行打印结果时,省略主机名称。

代码

def mask_host(url=None):
    """Return *url* with one label of its host name replaced by ``'***'``.

    Intended for printing URLs without revealing the full host. For a
    two-label host the first label is masked (``example.com`` ->
    ``***.com``); for longer hosts the second label is masked
    (``www.example.com`` -> ``www.***.com``).

    Args:
        url: Absolute http(s) URL.

    Returns:
        str: The masked URL, or ``''`` when *url* does not pass ``valid_https``.
    """
    if not valid_https(url):
        return ''
    segments = url.split('/')
    labels = segments[2].split('.')
    if len(labels) < 2:
        # Host without any dot: hide it entirely.
        segments[2] = '***'
    else:
        masked_index = 0 if len(labels) == 2 else 1
        labels[masked_index] = '***'
        segments[2] = '.'.join(labels)
    return '/'.join(segments)

RemoteTask

class RemoteTask:
    """Plain holder for the parameters of one HTTP request.

    Attributes:
        url: Address to fetch.
        referer: Value for the ``Referer`` header; the fetch functions fall
            back to ``url`` when this is ``None``.
        cookies: Value for the ``Cookie`` header; the header is omitted when
            this is ``None``.
    """

    def __init__(self, url=None, referer=None, cookies=None):
        self.url, self.referer, self.cookies = url, referer, cookies

    def __del__(self):
        # Clear every reference first, then remove the attributes themselves.
        # NOTE(review): the interpreter releases instance attributes on its own,
        # so this finalizer looks redundant; kept to preserve behaviour.
        for field in ('url', 'referer', 'cookies'):
            setattr(self, field, None)
        for field in ('cookies', 'referer', 'url'):
            delattr(self, field)

unspecific_ua

def unspecific_ua():
    """Return one User-Agent string picked at random from ``user_agent_list``."""
    picked = random.choice(user_agent_list)
    return picked

get_res_objects

说明

get_res_objects()函数返回三个值(data, url, cookies),分别是HTML源码文本(或二进制)、当前请求URL和Cookies。

代码

# -*- coding: utf-8 -*-
"""
@file: httpkit
@author: MR.N
@created: 2022/4/2 4月
@version: 1.0
@blog: https://blog.csdn.net/qq_21264377
"""


def get_res_objects(remote_task=None, dtype=0):
    """Fetch ``remote_task.url`` and return ``(data, url, cookies)``.

    Args:
        remote_task: A ``RemoteTask`` describing the request. Anything else
            (including ``None``) yields ``('', '', '')``.
        dtype: ``1`` returns the body as the raw bytes read from the response
            (no decompression, no decoding). Any other value returns the body
            decompressed according to ``Content-Encoding`` and decoded as
            strict UTF-8 text.

    Returns:
        tuple: ``(data, url, cookies)``.

        * ``('', '', '')`` when *remote_task* is not a ``RemoteTask`` or its
          URL does not pass ``valid_https``;
        * ``('', url, cookies)`` when the response status is not 200;
        * ``(body, url, cookies)`` otherwise.

        ``cookies`` is built from the cookies collected during the request,
        formatted as ``'name=value;'`` pairs concatenated together.

    Raises:
        Exceptions raised by ``opener.open()`` / ``response.read()`` (network
        errors, timeouts, HTTP errors) are not caught here.
        UnicodeDecodeError: the body is not valid UTF-8 (text mode only).

    Side effects:
        Sets the process-wide default socket timeout and replaces the ssl
        module's default HTTPS context factory (see the notes in the body).
    """
    # Local import: this block cannot rely on a module-level ``import zlib``.
    import zlib

    if remote_task is None:
        return '', '', ''
    if not isinstance(remote_task, RemoteTask):
        return '', '', ''
    url = remote_task.url
    if not valid_https(url):
        return '', '', ''
    referer = remote_task.referer
    cookies = remote_task.cookies
    headers = {
        'User-Agent': unspecific_ua(),
        # 'br' is deliberately not advertised: the decoding below only
        # handles gzip and deflate.
        'Accept-Encoding': 'gzip, deflate',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Site': 'none',
        'Upgrade-Insecure-Requests': '1',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive'}
    # Fall back to the requested URL itself when no referer was supplied.
    headers['Referer'] = url if referer is None else referer
    if cookies is not None:
        headers['Cookie'] = cookies
    socket.setdefaulttimeout(SOCKET_TIMEOUT)
    # NOTE(security): this swaps the ssl module's default HTTPS context factory
    # for the unverified one, i.e. certificate verification is disabled for
    # this and every later HTTPS request in the process. Kept because callers
    # may rely on it (e.g. self-signed hosts); review before production use.
    ssl._create_default_https_context = ssl._create_unverified_context
    request = urllib.request.Request(url, headers=headers)
    cookie_kit = http.cookiejar.CookieJar()
    cookie_handler = urllib.request.HTTPCookieProcessor(cookie_kit)
    opener = urllib.request.build_opener(cookie_handler)
    # The context manager closes the connection even when read() raises.
    with opener.open(request, timeout=HTTPS_TIMEOUT) as response:
        status = response.getcode()
        content_encoding = response.getheader('Content-Encoding')
        data = response.read() if status == 200 else b''
    # A cookie's value may be None; emit just the name in that case rather
    # than failing on the string formatting.
    cookie_res = ''.join(
        f'{cookie.name};' if cookie.value is None else f'{cookie.name}={cookie.value};'
        for cookie in cookie_kit)
    if status != 200:
        return '', url, cookie_res
    if dtype == 1:
        return data, url, cookie_res
    encoding = (content_encoding or '').strip().lower()
    if encoding == 'gzip':
        data = gzip.decompress(data)
    elif encoding == 'deflate':
        # HTTP "deflate" is normally a zlib stream, which gzip.decompress()
        # cannot read. Some servers send a raw deflate stream instead, so
        # retry without the zlib header.
        try:
            data = zlib.decompress(data)
        except zlib.error:
            data = zlib.decompress(data, -zlib.MAX_WBITS)
    return data.decode('UTF-8', errors='strict'), url, cookie_res
        

get_res_objects2

说明

get_res_objects2()函数返回状态字符串('success' 或 'err'),并通过传入的列表参数 ret 获得三个返回值(data, url, cookies),分别是HTML源码文本(或二进制)、当前请求URL和Cookies。

代码

def get_res_objects2(remote_task=None, ret=None, dtype=0):
    """Fetch ``remote_task.url``, append the result to *ret*, return a status.

    Args:
        remote_task: A ``RemoteTask`` describing the request. Anything else
            (including ``None``) is reported as ``'err'``.
        ret: List that receives three items. When omitted, a throw-away list
            is used, so the results are only visible if the caller passes one.
        dtype: ``1`` stores the body as the raw bytes read from the response
            (no decompression, no decoding). Any other value stores the body
            decompressed according to ``Content-Encoding`` and decoded as
            strict UTF-8 text.

    Returns:
        str: ``'success'`` when the response status is 200; *ret* is then
        extended with ``[data, url, cookies]``, where ``cookies`` is built
        from the cookies collected during the request as ``'name=value;'``
        pairs. ``'err'`` otherwise; *ret* is then extended with
        ``['', '', '']``.

    Raises:
        Exceptions raised by ``opener.open()`` / ``response.read()`` (network
        errors, timeouts, HTTP errors) are not caught here.
        UnicodeDecodeError: the body is not valid UTF-8 (text mode only).

    Side effects:
        Sets the process-wide default socket timeout and replaces the ssl
        module's default HTTPS context factory (see the notes in the body).
    """
    # Local import: this block cannot rely on a module-level ``import zlib``.
    import zlib

    # A literal ``[]`` default would be created once and shared by every call,
    # silently accumulating results across calls.
    if ret is None:
        ret = []
    if remote_task is None:
        ret += ['', '', '']
        return 'err'
    if not isinstance(remote_task, RemoteTask):
        ret += ['', '', '']
        return 'err'
    url = remote_task.url
    if not valid_https(url):
        ret += ['', '', '']
        return 'err'
    referer = remote_task.referer
    cookies = remote_task.cookies
    headers = {
        'User-Agent': unspecific_ua(),
        # 'br' is deliberately not advertised: the decoding below only
        # handles gzip and deflate.
        'Accept-Encoding': 'gzip, deflate',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Site': 'none',
        'Upgrade-Insecure-Requests': '1',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive'}
    # Fall back to the requested URL itself when no referer was supplied.
    headers['Referer'] = url if referer is None else referer
    if cookies is not None:
        headers['Cookie'] = cookies
    socket.setdefaulttimeout(SOCKET_TIMEOUT)
    # NOTE(security): this swaps the ssl module's default HTTPS context factory
    # for the unverified one, i.e. certificate verification is disabled for
    # this and every later HTTPS request in the process. Kept because callers
    # may rely on it (e.g. self-signed hosts); review before production use.
    ssl._create_default_https_context = ssl._create_unverified_context
    request = urllib.request.Request(url, headers=headers)
    cookie_kit = http.cookiejar.CookieJar()
    cookie_handler = urllib.request.HTTPCookieProcessor(cookie_kit)
    opener = urllib.request.build_opener(cookie_handler)
    # The context manager closes the connection even when read() raises.
    with opener.open(request, timeout=HTTPS_TIMEOUT) as response:
        status = response.getcode()
        content_encoding = response.getheader('Content-Encoding')
        data = response.read() if status == 200 else b''
    # A cookie's value may be None; emit just the name in that case rather
    # than failing on the string formatting.
    cookie_res = ''.join(
        f'{cookie.name};' if cookie.value is None else f'{cookie.name}={cookie.value};'
        for cookie in cookie_kit)
    if status != 200:
        ret += ['', '', '']
        return 'err'
    if dtype != 1:
        encoding = (content_encoding or '').strip().lower()
        if encoding == 'gzip':
            data = gzip.decompress(data)
        elif encoding == 'deflate':
            # HTTP "deflate" is normally a zlib stream, which gzip.decompress()
            # cannot read. Some servers send a raw deflate stream instead, so
            # retry without the zlib header.
            try:
                data = zlib.decompress(data)
            except zlib.error:
                data = zlib.decompress(data, -zlib.MAX_WBITS)
        data = data.decode('UTF-8', errors='strict')
    # Report the cookies collected from the response, not the cookies that
    # were sent with the request.
    ret += [data, url, cookie_res]
    return 'success'
        
 类似资料: