"""
@created: 2022/3/25 Fri.
@by: MR.N
@blog: https://blog.csdn.net/qq_21264377
@version: 1.0
"""
# Pool of desktop/mobile browser User-Agent strings; one entry is picked at
# random for each outgoing request.
user_agent_list = [
    'Mozilla/5.0 (Linux; Android 7.0; SM-G892A Build/NRD90M; wv) '
    'AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/67.0.3396.87 Mobile Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1',
    'Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) '
    'Chrome/20.0.1132.57 Safari/536.11',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5',
    'Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) '
    'Chrome/19.0.1063.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) '
    'Chrome/19.0.1055.1 Safari/535.24',
]

# Value handed to socket.setdefaulttimeout() before each request.
SOCKET_TIMEOUT = 30
# Value passed as the ``timeout`` argument when opening a request.
HTTPS_TIMEOUT = 7
def valid_https(url=None):
    """Return True when *url* looks like an absolute http(s) URL.

    This is a shallow, purely textual check: the string must start with
    ``http://`` or ``https://`` and the host segment that follows must
    contain at least one dot. It does not validate the host name itself.

    Args:
        url: Candidate URL. Anything that is not a ``str`` is rejected.

    Returns:
        bool: True if the URL passes the checks above, False otherwise.
    """
    if not isinstance(url, str):
        return False
    paths = url.split('/')
    # 'scheme://host' splits into at least ['scheme:', '', 'host']; shorter
    # inputs (e.g. 'http:/') have no host segment to inspect.
    if len(paths) < 3:
        return False
    if paths[0] not in ('http:', 'https:'):
        return False
    # The empty segment between the two slashes of '://'.
    if paths[1] != '':
        return False
    return '.' in paths[2]
def split_path(url, separator='/'):
    """Break *url* into a list of segments on *separator* ('/' by default)."""
    segments = url.split(separator)
    return segments
def get_host(paths):
    """Re-join the first three segments of a split URL with '/'.

    For segments produced by splitting 'scheme://host/...' on '/', the
    result is 'scheme://host' (no trailing slash).
    """
    leading_segments = paths[0:3]
    return '/'.join(leading_segments)
def get_current_dir(paths):
    """Re-join every segment except the last one with '/'.

    For a split page URL this yields the directory that contains the page
    (no trailing slash). The last segment is dropped unconditionally, even
    when it is the host itself.
    """
    all_but_last = paths[:-1]
    return '/'.join(all_but_last)
URL地址分为完整地址和缩写地址。完整地址包含访问协议、主机地址和路径,而缩写地址省略了其中一部分(通常只包含路径)。缩写地址是相对于当前请求网页地址的网络资源描述,因此需要根据当前网页所在的主机地址和网站目录补全为完整的地址。
常见的缩写地址有三种格式://host/path、/path 和 path。第一种缺失访问协议,补全时采用与当前网页URL相同的访问协议(http 或 https);第二种相对于主机地址的网站根目录;第三种相对于当前网页地址所在的目录。
# -*- coding: utf-8 -*-
"""
@file: httpkit
@author: MR.N
@created: 2022/4/2 4月
@version: 1.0
@blog: https://blog.csdn.net/qq_21264377
"""
def pad(url, links):
    """Expand abbreviated links found on the page at *url* into full URLs.

    Links are handled by prefix:

    * ``http://``, ``https://`` or ``ftp://`` - already complete, kept as is;
    * ``//host/path`` - the scheme of *url* is prepended;
    * ``/path`` - resolved against the site root of *url*;
    * anything else - resolved against the directory containing *url*.

    Args:
        url: Absolute http(s) URL of the page the links were taken from.
        links: Iterable of link strings; ``None`` is treated as empty.

    Returns:
        list: The expanded links in input order, or ``[]`` when *url* does
        not pass ``valid_https``.
    """
    if not valid_https(url):
        return []
    paths = split_path(url)
    scheme = paths[0]
    host = get_host(paths) + '/'
    if len(paths) > 3:
        current_dir = get_current_dir(paths) + '/'
    else:
        # A bare-host URL such as 'http://example.com' has no path segment;
        # dropping the last segment would cut off the host and leave
        # 'http:/', so relative links resolve against the site root instead.
        current_dir = host
    res = []
    for link in links or ():
        if link.startswith(('http://', 'https://', 'ftp://')):
            pass
        elif link.startswith('//'):
            link = scheme + link
        elif link.startswith('/'):
            link = host + link[1:]
        else:
            link = current_dir + link
        res.append(link)
    return res
mask_host()函数用于在命令行打印结果时隐藏主机名称:它把主机名中的一段替换为 ***。
def mask_host(url=None):
    """Return *url* with one label of its host replaced by '***'.

    For a two-label host the first label is replaced; for longer hosts the
    second label is replaced. A host without any dot is replaced entirely.
    Returns '' when *url* does not pass ``valid_https``.
    """
    if not valid_https(url):
        return ''
    segments = url.split('/')
    hostname = segments[2]
    if '.' not in hostname:
        segments[2] = '***'
    else:
        labels = hostname.split('.')
        hidden_index = 0 if len(labels) == 2 else 1
        labels[hidden_index] = '***'
        segments[2] = '.'.join(labels)
    return '/'.join(segments)
class RemoteTask:
    """Plain holder for the parameters of one HTTP request.

    Attributes:
        url: Address to request.
        referer: Value for the ``Referer`` header, or None.
        cookies: Value for the ``Cookie`` header, or None.
    """

    def __init__(self, url=None, referer=None, cookies=None):
        self.url, self.referer, self.cookies = url, referer, cookies

    def __del__(self):
        # Reset every field to None first, then remove the attributes in
        # reverse order of their creation.
        for field in ('url', 'referer', 'cookies'):
            setattr(self, field, None)
        for field in ('cookies', 'referer', 'url'):
            delattr(self, field)
def unspecific_ua():
    """Return one User-Agent string picked at random from ``user_agent_list``."""
    # Imported locally: no module-level ``import random`` is visible in this
    # file, so the bare name would raise NameError on the first call.
    import random

    return random.choice(user_agent_list)
get_res_objects()函数返回三个值(data, url, cookies),分别是HTML源码文本(或二进制)、当前请求URL和Cookies。
# -*- coding: utf-8 -*-
"""
@file: httpkit
@author: MR.N
@created: 2022/4/2 4月
@version: 1.0
@blog: https://blog.csdn.net/qq_21264377
"""
def get_res_objects(remote_task=None, dtype=0):
    """Fetch ``remote_task.url`` and return ``(data, url, cookies)``.

    Args:
        remote_task: ``RemoteTask`` holding the URL plus an optional referer
            and an optional ``Cookie`` header value.
        dtype: ``1`` returns the body as raw bytes exactly as received (no
            Content-Encoding decoding is applied); any other value returns
            the body decoded as UTF-8 text.

    Returns:
        tuple: ``(data, url, cookies)`` where

        * ``data`` is the response body (bytes for ``dtype == 1``, else str),
          or ``''`` when the status code is not 200;
        * ``url`` is the requested URL (not a post-redirect URL);
        * ``cookies`` is a ``'name=value;'`` string built from the cookies
          the server set during the request.

        ``('', '', '')`` is returned when *remote_task* is missing, is not a
        ``RemoteTask``, or carries a URL rejected by ``valid_https``.

    Raises:
        Exceptions raised while opening or reading the request are not caught
        here, and neither is ``UnicodeDecodeError`` for a body that is not
        valid UTF-8.
    """
    # Imported locally so the function works even though the file has no
    # module-level import block.
    import gzip
    import http.cookiejar
    import socket
    import ssl
    import urllib.request
    import zlib

    if remote_task is None:
        return '', '', ''
    if not isinstance(remote_task, RemoteTask):
        return '', '', ''
    url = remote_task.url
    if not valid_https(url):
        return '', '', ''
    referer = remote_task.referer
    cookies = remote_task.cookies
    headers = {
        'User-Agent': unspecific_ua(),
        # 'br' is not advertised: only gzip and deflate are decoded below.
        'Accept-Encoding': 'gzip, deflate',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Site': 'none',
        'Upgrade-Insecure-Requests': '1',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive'}
    # Without an explicit referer the page acts as its own referer.
    headers['Referer'] = url if referer is None else referer
    if cookies is not None:
        headers['Cookie'] = cookies
    socket.setdefaulttimeout(SOCKET_TIMEOUT)
    # NOTE(review): this replaces the process-wide default HTTPS context with
    # one that skips certificate verification. Kept because callers may rely
    # on it, but it affects every later HTTPS request in the process.
    ssl._create_default_https_context = ssl._create_unverified_context
    request = urllib.request.Request(url, headers=headers)
    cookie_kit = http.cookiejar.CookieJar()
    cookie_handler = urllib.request.HTTPCookieProcessor(cookie_kit)
    opener = urllib.request.build_opener(cookie_handler)
    # The context manager closes the connection even if reading fails.
    with opener.open(request, timeout=HTTPS_TIMEOUT) as response:
        status = response.getcode()
        content_encoding = response.getheader('Content-Encoding')
        data = response.read() if status == 200 else None
    # A cookie may carry no value (None); emit it as an empty value.
    cookie_res = ''.join(
        f'{cookie.name}={cookie.value or ""};' for cookie in cookie_kit
    )
    if status != 200:
        return '', url, cookie_res
    if dtype == 1:
        return data, url, cookie_res
    encoding = (content_encoding or '').strip().lower()
    if encoding == 'gzip':
        data = gzip.decompress(data)
    elif encoding == 'deflate':
        # 'deflate' bodies are zlib streams, which gzip.decompress rejects.
        # Some servers send a raw deflate stream without the zlib header, so
        # fall back to that form.
        try:
            data = zlib.decompress(data)
        except zlib.error:
            data = zlib.decompress(data, -zlib.MAX_WBITS)
    return data.decode('UTF-8', errors='strict'), url, cookie_res
get_res_objects2()函数返回状态字符串('success' 或 'err'),并把三个结果值(data, url, cookies)追加到传入的列表参数中,分别是HTML源码文本(或二进制)、当前请求URL和Cookies。
def get_res_objects2(remote_task=None, ret=None, dtype=0):
    """Fetch ``remote_task.url``; report status and append results to *ret*.

    Args:
        remote_task: ``RemoteTask`` holding the URL plus an optional referer
            and an optional ``Cookie`` header value.
        ret: Caller-supplied list that receives three extra items,
            ``[data, url, cookies]``. When omitted, the results are discarded
            and only the status string is available.
        dtype: ``1`` stores the body as raw bytes exactly as received (no
            Content-Encoding decoding is applied); any other value stores
            the body decoded as UTF-8 text.

    Returns:
        str: ``'success'`` when the response status is 200; *ret* then gains
        the body, the requested URL and a ``'name=value;'`` string of the
        cookies set by the server. ``'err'`` otherwise (missing or wrong
        *remote_task*, URL rejected by ``valid_https``, or a non-200 status);
        *ret* then gains ``['', '', '']``.

    Raises:
        Exceptions raised while opening or reading the request are not caught
        here, and neither is ``UnicodeDecodeError`` for a body that is not
        valid UTF-8.
    """
    # Imported locally so the function works even though the file has no
    # module-level import block.
    import gzip
    import http.cookiejar
    import socket
    import ssl
    import urllib.request
    import zlib

    # A fresh list per call: a shared ``[]`` default would keep accumulating
    # results across unrelated calls.
    if ret is None:
        ret = []
    if remote_task is None:
        ret.extend(['', '', ''])
        return 'err'
    if not isinstance(remote_task, RemoteTask):
        ret.extend(['', '', ''])
        return 'err'
    url = remote_task.url
    if not valid_https(url):
        ret.extend(['', '', ''])
        return 'err'
    referer = remote_task.referer
    cookies = remote_task.cookies
    headers = {
        'User-Agent': unspecific_ua(),
        # 'br' is not advertised: only gzip and deflate are decoded below.
        'Accept-Encoding': 'gzip, deflate',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Site': 'none',
        'Upgrade-Insecure-Requests': '1',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive'}
    # Without an explicit referer the page acts as its own referer.
    headers['Referer'] = url if referer is None else referer
    if cookies is not None:
        headers['Cookie'] = cookies
    socket.setdefaulttimeout(SOCKET_TIMEOUT)
    # NOTE(review): this replaces the process-wide default HTTPS context with
    # one that skips certificate verification. Kept because callers may rely
    # on it, but it affects every later HTTPS request in the process.
    ssl._create_default_https_context = ssl._create_unverified_context
    request = urllib.request.Request(url, headers=headers)
    cookie_kit = http.cookiejar.CookieJar()
    cookie_handler = urllib.request.HTTPCookieProcessor(cookie_kit)
    opener = urllib.request.build_opener(cookie_handler)
    # The context manager closes the connection even if reading fails.
    with opener.open(request, timeout=HTTPS_TIMEOUT) as response:
        status = response.getcode()
        content_encoding = response.getheader('Content-Encoding')
        data = response.read() if status == 200 else None
    # A cookie may carry no value (None); emit it as an empty value.
    cookie_res = ''.join(
        f'{cookie.name}={cookie.value or ""};' for cookie in cookie_kit
    )
    if status != 200:
        ret.extend(['', '', ''])
        return 'err'
    if dtype != 1:
        encoding = (content_encoding or '').strip().lower()
        if encoding == 'gzip':
            data = gzip.decompress(data)
        elif encoding == 'deflate':
            # 'deflate' bodies are zlib streams, which gzip.decompress
            # rejects. Some servers send a raw deflate stream without the
            # zlib header, so fall back to that form.
            try:
                data = zlib.decompress(data)
            except zlib.error:
                data = zlib.decompress(data, -zlib.MAX_WBITS)
        data = data.decode('UTF-8', errors='strict')
    # Hand back the cookies received from the server, not the request
    # cookies the caller already has.
    ret.extend([data, url, cookie_res])
    return 'success'