当一个目标站加密字段特别多,而且我们对其并不需要大规模爬取时,考虑到时间成本,我们就可以使用RPC来获取加密数据。
RPC(Remote Procedure Call): 远程过程调用。
原理简单地说就是搭建一个后端服务器, 然后把拦截到的浏览器数据通过 WebSocket 发送到我们的服务器, 我们再通过 API 去获取加密数据。
这里就不多说了, 想详细了解的请自行去 Google 吧!
今天我带来的是一个简单的RPC模型, 架构很简单, 主要分为三个部分: 1、spider客户端 2、服务器 3、浏览器客户端。其中服务器只做数据的中转, 数据逻辑处理都在客户端完成。工作流程如下:
以上就是整个RPC 获取加密数据的流程。下面是代码:
import asyncio
import websockets
# Pool of currently connected websockets; main_logic adds and removes entries.
client = []


async def send_to_all(message):
    """Relay ``message`` to every connection currently in ``client``.

    Delivery is sequential and best-effort: a connection whose ``send``
    raises is skipped so the remaining connections still get the message.
    The pool itself is not modified here; each connection's own handler is
    responsible for unregistering it.

    Args:
        message: The payload to forward, passed unchanged to ``send``.
    """
    # Iterate over a snapshot: other handlers may add or remove entries while
    # this coroutine is suspended in ``send``, and mutating a list that is
    # being iterated makes the loop skip elements.
    for connection in list(client):
        try:
            await connection.send(message)
        except Exception:
            # One failing peer (typically one that has just disconnected)
            # must not prevent delivery to the peers after it.
            continue
async def main_logic(websocket):
    """Serve one connection: register it, then relay everything it sends.

    The connection is added to ``client`` and every message received from it
    is handed to ``send_to_all``. Because the connection is itself in the
    pool, it also receives its own messages back. The loop ends as soon as
    ``recv`` raises, which is treated as the connection having ended.

    Args:
        websocket: The connection object supplied by ``websockets.serve``.
    """
    client.append(websocket)
    try:
        while True:
            try:
                recv_text = await websocket.recv()
            except Exception:
                # Any receive failure ends this connection's service loop.
                break
            await send_to_all(recv_text)
    finally:
        # Unregister on every exit path, including an exception raised by the
        # broadcast; otherwise a stale socket stays in the pool and is written
        # to on every later broadcast.
        client.remove(websocket)
async def main():
    """Serve ``main_logic`` on 127.0.0.1:9999 until the process is stopped."""
    relay = websockets.serve(main_logic, '127.0.0.1', 9999)
    async with relay:
        # Nothing ever resolves this future, so awaiting it keeps the
        # server context open.
        never_done = asyncio.Future()
        await never_done


if __name__ == '__main__':
    asyncio.run(main())
import asyncio
import websockets
import json
import time
"""
请求端 -- spider
"""
async def ws_client(url, n):
    """Request a signature for page ``n`` through the relay and return the reply.

    A JSON request tagged with ``id`` '1' is sent, carrying the page number
    and the current time in milliseconds. Incoming frames tagged '1' are
    skipped; the first frame with any other ``id`` is returned as the raw,
    still-encoded text.

    Args:
        url: WebSocket URL of the relay server.
        n: Page number to put into the request.

    Returns:
        The undecoded text of the first frame whose ``id`` is not '1'.
    """
    async with websockets.connect(url) as websocket:
        stamp = round(time.time() * 1000)
        request = {'id': '1', 'value': {'page': n, 'time': stamp}}
        await websocket.send(json.dumps(request))
        while True:
            response = await websocket.recv()
            if json.loads(response).get('id') != '1':
                return response


if __name__ == "__main__":
    # Ask the relay for the signature of page 1 and show the raw reply.
    data = asyncio.run(ws_client('ws://127.0.0.1:9999', 1))
    print(data)
!function () {
    // flagLX keeps the browser from opening a new connection every time
    // this injected snippet runs.
    if (window.flagLX) {
        return;
    }
    // `sign` is the page's signing function; change it to the one you located.
    window.weiboLx = sign;
    // host:port of the relay server
    var ws = new WebSocket("ws://127.0.0.1:9999");
    // Set the flag immediately so repeated runs do not each start a connection
    // while this one is still being established.
    window.flagLX = true;
    ws.onopen = function (evt) {};
    // Clear the flag once the connection is gone so that a later run of this
    // snippet can connect again.
    ws.onclose = function (evt) {
        window.flagLX = false;
    };
    ws.onmessage = function (evt) {
        // Drop JSON.parse here if the incoming payload is not JSON.
        var n = JSON.parse(evt.data);
        // Frames tagged '2' are signing results (this handler sends them
        // with that tag), so they are ignored rather than signed again.
        if (n['id'] == '2') {
            return;
        }
        var data = n['value'];
        // Compute the signature in the page's own context.
        var res = window.weiboLx(data['page'] + '|' + data['time']);
        // Send the signed value back through the relay server.
        ws.send(JSON.stringify({
            'id': '2',
            'value': res
        }));
    };
}();
注意: 这段js代码需要自行定位加密函数位置,把代码加入其中。具体步骤:
使用案例:
猿人学的第二十题
server.py
import asyncio
import websockets
# Pool of currently connected websockets; main_logic adds and removes entries.
client = []


async def send_to_all(message):
    """Forward ``message`` to each connection registered in ``client``.

    Sends happen one after another. A connection whose ``send`` raises is
    skipped so that the connections after it are still served. This function
    never changes the pool; unregistering is left to each connection's own
    handler.

    Args:
        message: The payload to forward, passed unchanged to ``send``.
    """
    # Work on a copy: the pool can change while this coroutine is suspended
    # inside ``send``, and iterating a list that is being mutated skips items.
    for peer in list(client):
        try:
            await peer.send(message)
        except Exception:
            # Best-effort relay: a peer that cannot be written to (typically
            # one that has just dropped) must not block the others.
            continue
async def main_logic(websocket):
    """Handle a single connection for as long as it keeps delivering messages.

    The connection is registered in ``client``; each message read from it is
    passed to ``send_to_all``, so the sender receives its own message back as
    well. A raising ``recv`` is treated as the end of the connection.

    Args:
        websocket: The connection object supplied by ``websockets.serve``.
    """
    client.append(websocket)
    try:
        while True:
            try:
                recv_text = await websocket.recv()
            except Exception:
                # Stop serving this connection on any receive failure.
                break
            await send_to_all(recv_text)
    finally:
        # Guarantee removal from the pool even if the broadcast raises, so no
        # dead socket is left behind for later broadcasts to trip over.
        client.remove(websocket)
async def main():
    """Run the relay on 127.0.0.1:9999 and keep it up indefinitely."""
    server = websockets.serve(main_logic, '127.0.0.1', 9999)
    async with server:
        # Awaiting a future that is never completed blocks here forever,
        # which keeps the server context open.
        run_forever = asyncio.Future()
        await run_forever


if __name__ == '__main__':
    asyncio.run(main())
main.py
import asyncio
import sys
import websockets
import json
import time
import requests
from submit import ExSubmit
"""
请求端 -- spider
"""
async def ws_client(url, page, time_str):
    """Ask the relay for the signature of ``page`` at ``time_str``.

    Sends a JSON request tagged with ``id`` '1' and then reads frames until
    one arrives whose ``id`` is not '1'; frames tagged '1' are skipped.

    Args:
        url: WebSocket URL of the relay server.
        page: Page number placed in the request.
        time_str: Timestamp placed in the request alongside the page.

    Returns:
        The raw, still JSON-encoded text of the first frame not tagged '1'.
    """
    async with websockets.connect(url) as websocket:
        request = {'id': '1', 'value': {'page': page, 'time': time_str}}
        await websocket.send(json.dumps(request))
        while True:
            response = await websocket.recv()
            reply = json.loads(response)
            if reply.get('id') != '1':
                return response
if __name__ == "__main__":
    # Fetch pages 1-5 of the match API, signing each request through the
    # websocket relay, and add up every returned ``value``.
    url = "https://match.yuanrenxue.cn/api/match/20"
    # NOTE(review): the cookie below (and the ExSubmit call at the end) embed
    # what looks like a real session id; consider loading it from the
    # environment instead of keeping it in the source.
    headers = {
        'authority': "match.yuanrenxue.cn",
        'accept': "application/json, text/javascript, */*; q=0.01",
        'accept-language': "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        'cache-control': "no-cache,no-cache",
        'cookie': "no-alert3=true; Hm_lvt_c99546cf032aaa5a679230de9a95c7db=1680341495,1680341600,1680417806,1680477550; Hm_lvt_9bcbda9cbf86757998a2339a0437208e=1680478049; Hm_lvt_434c501fe98c1a8ec74b813751d4e3e3=1680940092; m=5f464c4990212f648d29c9d6cb11f4bb; RM4hZBv0dDon443M=tadIJCdI390rg0XPWepU3ZtEXWceovCrsx6wwpLJyOsM5JfeYtUZgz4Vbkc99+Iie7BbXDLwRqk/SBFzU+zLn4fusno7SgY5YhUACOABKh2lpgrQL8w7SBRZryxDJ8fQg5yBnFC9YvFnZXEu5UrGEswAab6ux00gSyLWf78J2JfajtgCBKCSKvRB5p4JcfDKSXNnY/wTCs4SBgNc5OYXCh6I99uZ06CPGg096PEzlF8=; Hm_lpvt_434c501fe98c1a8ec74b813751d4e3e3=1680945275; tk=4599678554058727860; sessionid=kbn1zg4itb0fxx37mulvw6tzwwmm1zyv; Hm_lpvt_9bcbda9cbf86757998a2339a0437208e=1681515897; Hm_lpvt_c99546cf032aaa5a679230de9a95c7db=1681515965",
        'pragma': "no-cache",
        'referer': "https://match.yuanrenxue.cn/match/20",
        'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.39",
    }
    number = 0
    for i in range(1, 6):
        # The same millisecond timestamp goes into the signature and the query.
        time_str = round(time.time() * 1000)
        data = asyncio.run(ws_client('ws://127.0.0.1:9999', i, time_str))
        data_info = json.loads(data)
        print(data_info)
        querystring = {"page": i, "sign": data_info.get('value'), "t": f"{time_str}"}
        if i >= 4:
            # Pages 4 and 5 are requested with a different user agent;
            # presumably a requirement of the target site - TODO confirm.
            headers['user-agent'] = 'yuanrenxue.project'
        # Bound the request so an unresponsive server cannot hang the run.
        response = requests.request("GET", url, headers=headers, params=querystring, timeout=10)
        try:
            resp_data = response.json()
        except ValueError:
            # The body was not JSON: show it and stop.
            print(response.text)
            sys.exit(1)
        print(resp_data)
        rows = resp_data.get('data')
        if rows is None:
            # Fail with a clear message instead of a TypeError from iterating None.
            sys.exit("response has no 'data' field")
        for row in rows:
            number += row.get('value')
    print(number)
    # The author's own helper for submitting the answer; can be ignored.
    ExSubmit(number, 20, 'kbn1zg4itb0fxx37mulvw6tzwwmm1zyv')
web.js
!function () {
    // flagLX prevents a new connection from being opened on every run of
    // this injected snippet.
    if (window.flagLX) {
        return;
    }
    // Expose the page's signing function so the message handler can call it.
    window.weiboLx = sign;
    var ws = new WebSocket("ws://127.0.0.1:9999");
    // Set the flag before the connection is established so overlapping runs
    // do not each create their own socket.
    window.flagLX = true;
    ws.onopen = function (evt) {};
    // Once the socket closes, clear the flag so a later run can reconnect.
    ws.onclose = function (evt) {
        window.flagLX = false;
    };
    ws.onmessage = function (evt) {
        // Drop JSON.parse here if the incoming payload is not JSON.
        var n = JSON.parse(evt.data);
        // Frames tagged '2' are signing results (this handler sends them
        // with that tag), so they are not signed again.
        if (n['id'] == '2') {
            return;
        }
        var data = n['value'];
        var res = window.weiboLx(data['page'] + '|' + data['time']);
        ws.send(JSON.stringify({
            'id': '2',
            'value': res
        }));
    };
}();
这里给新手提示一下, 启动顺序: