通过Selenium和代理已经可以很顺畅地拿到网页页面上可以看到的内容了。某些网站采用了Ajax技术,就我遇到的一个问题来说吧,在爬某网站的某个主页面的时候,它使用Ajax动态更新翻页内容,URL并没有变,这时候如果直接按URL发请求过去请求到的始终都是第一页的信息(如果有大神看见拜托指点一下),加上我发现它其实还有很多有用的信息并没有在页面直接显示,所以就有了能不能不通过页面直接把信息下载下来的想法。
下面是我参考一些资料写的一个例子:拦截URL中包含Ajax.json的请求,将其响应体以.txt文件下载到指定路径,文件名取自返回JSON中"content"下的"pageNo"字段。
代码:
1.manifest.json
{
"version": "1.0.0",
"manifest_version": 2,
"name": "Chrome Proxy",
"permissions": [
"proxy",
"tabs",
"unlimitedStorage",
"storage",
"<all_urls>",
"webRequest",
"webRequestBlocking"
],
"background": {
"scripts": ["background.js"]
},
"minimum_chrome_version":"22.0.0"
}
2.background.js
// Proxy configuration in "fixed_servers" mode with a single HTTP proxy.
// "代理host" and 代理端口 are placeholders: substitute the real proxy host and
// port before packaging the extension.
var config = {
  mode: "fixed_servers",
  rules: {
    singleProxy: {
      scheme: "http",
      host: "代理host",
      // Pass the radix explicitly so the port is always parsed as decimal.
      port: parseInt(代理端口, 10)
    },
  }
};

// Apply the proxy configuration at the "regular" scope and surface any failure
// instead of ignoring it silently.
chrome.proxy.settings.set({value: config, scope: "regular"}, function() {
  if (chrome.runtime.lastError) {
    console.error("Failed to apply proxy settings:", chrome.runtime.lastError.message);
  }
});
/**
 * Builds the response to an authentication challenge using fixed credentials.
 * "username" and "password" are placeholders for the real proxy credentials.
 *
 * @param {object} details - Challenge details supplied by the caller (unused).
 * @returns {{authCredentials: {username: string, password: string}}} The
 *   credentials wrapped in the shape expected by the auth listener.
 */
function callbackFn(details) {
  const authCredentials = { username: "username", password: "password" };
  return { authCredentials };
}
// Register callbackFn as a blocking onAuthRequired listener for every URL, so
// the credentials it returns are used to answer authentication challenges.
chrome.webRequest.onAuthRequired.addListener(callbackFn, { urls: ["<all_urls>"] }, ["blocking"]);
3.index.js
// Per the original author's note, console.log cannot be used from this
// extension script, so log arguments are forwarded to the background script
// as a request message instead.
// The inspected tab's id is exposed on chrome.devtools.inspectedWindow; there
// is no chrome.devtools.tabId property, so reading it always gave undefined.
const log = (...args) => chrome.extension.sendRequest({
  tabId: chrome.devtools.inspectedWindow.tabId,
  args,
});
// Register a callback that fires after every HTTP request in the inspected
// page has finished.
chrome.devtools.network.onRequestFinished.addListener(async (entry) => {
  try {
    // `request.url` identifies the request; `getContent` fetches the response body.
    const { request, getContent } = entry;
    const { url } = request;

    // Only responses whose URL mentions "Ajax.json" are saved.
    if (!url.includes("Ajax.json")) {
      return;
    }

    // getContent delivers the body through its callback (not its return
    // value), so adapt it to a promise before awaiting.
    const body = await new Promise((resolve) => getContent(resolve));
    const parsed = JSON.parse(body);

    // Save the raw body as a .txt file named after content.pageNo.
    const fileName = `${parsed["content"]["pageNo"].toString()}.txt`;
    doSave(body, "text/latex", fileName);
  } catch (err) {
    log(err.stack || err.toString());
  }
});
/**
 * Triggers a browser download of `value` as a file called `name`.
 *
 * Strategy, in order of preference:
 *   1. a hidden <a download> element pointing at a blob URL, clicked synthetically;
 *   2. navigator.msSaveBlob when the anchor has no `download` support;
 *   3. navigating the current page to the blob URL.
 *
 * @param {*} value - Content placed into the blob (e.g. a string).
 * @param {string} type - MIME type assigned to the blob.
 * @param {string} name - File name suggested for the download.
 * @returns {void}
 */
function doSave(value, type, name) {
  let blob;
  if (typeof window.Blob === "function") {
    blob = new window.Blob([value], { type: type });
  } else {
    // Legacy fallback for engines that only ship a vendor BlobBuilder.
    const BlobBuilder = window.BlobBuilder || window.MozBlobBuilder || window.WebKitBlobBuilder || window.MSBlobBuilder;
    const builder = new BlobBuilder();
    builder.append(value);
    blob = builder.getBlob(type);
  }

  // Resolve the URL API once and use the same object for both create and
  // revoke; the original revoked through window.URL even when only
  // window.webkitURL existed, which threw a TypeError.
  const urlApi = window.URL || window.webkitURL;
  const blobUrl = urlApi.createObjectURL(blob);

  const anchor = document.createElement("a");
  if ("download" in anchor) {
    anchor.style.visibility = "hidden";
    anchor.href = blobUrl;
    anchor.download = name;
    document.body.appendChild(anchor);
    const clickEvent = document.createEvent("MouseEvents");
    clickEvent.initEvent("click", true, true);
    anchor.dispatchEvent(clickEvent);
    document.body.removeChild(anchor);
    urlApi.revokeObjectURL(blobUrl);
  } else if (navigator.msSaveBlob) {
    navigator.msSaveBlob(blob, name);
  } else {
    // Last resort: navigate to the blob URL (it must stay valid, so no revoke).
    location.href = blobUrl;
  }
}
4.index.html
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta http-equiv="X-UA-Compatible" content="ie=edge">
<title>Document</title>
<!--
  Page with an empty body whose only job is to load the two scripts below.
  NOTE(review): index.js uses chrome.devtools, so this page presumably has to be
  registered as the extension's "devtools_page" in its manifest - confirm, since
  the manifest shown alongside this file does not declare one.
-->
<script src="./index.js"></script>
<script src="./background.js"></script>
</head>
<body>
</body>
</html>
5.压缩,ajax.zip
// Use the current working directory as the base; the error returned by
// os.Getwd is discarded here.
downloadDir, _ = os.Getwd()
// The download directory is "<cwd>\downloads\", built with Windows-style
// backslash separators and a trailing separator.
downloadPath := downloadDir + "\\downloads\\" // note how downloadPath is written
// OpenChrome starts a Chrome session through the Selenium server at
// localhost:<port> (port is defined outside this function) and returns the
// resulting WebDriver.
//
// The browser is configured to download into downloadPath, to skip image
// loading, and to load the "other/proxy.zip" and "other/ajax.zip" extensions.
// Any failure (adding an extension or creating the remote session) is logged
// and retried after a random 1-5 second pause; the function only returns once
// a session has been created, so the returned error is nil on return.
func OpenChrome(downloadPath string) (wd selenium.WebDriver, err error) {
	// pauseBeforeRetry logs the failure, quits any driver currently held in
	// wd, and sleeps a random 1-5 seconds before the next attempt.
	pauseBeforeRetry := func(logArgs ...interface{}) {
		log.Println(logArgs...)
		if wd != nil {
			wd.Quit()
		}
		rand.Seed(time.Now().UnixNano())
		time.Sleep(time.Second * time.Duration(rand.Intn(5)+1))
	}

	for {
		time.Sleep(time.Second)

		// Browser preferences: value 2 disables image loading to speed up
		// rendering; downloads go to downloadPath.
		prefs := map[string]interface{}{
			"profile.managed_default_content_settings.images": 2,
			"download.default_directory":                      downloadPath,
			"history.deleting_enabled":                        true,
		}

		// Chrome launch options. Headless mode and the command-line proxy
		// flag are intentionally not set here.
		chromeCaps := chrome.Capabilities{
			Prefs: prefs,
			Path:  "",
			Args: []string{
				"--start-maximized",
				"--window-size=12,12",
				"--no-sandbox",
				"--user-agent=" + utils.GetRandomUserAgent(),
				"--disable-gpu",
				"--disable-impl-side-painting",
				"--disable-gpu-sandbox",
				"--disable-accelerated-2d-canvas",
				"--disable-accelerated-jpeg-decoding",
				"--test-type=ui",
				"--auto-open-devtools-for-tabs", // open DevTools (F12) on every tab
			},
		}

		// Attach the proxy extension, then the Ajax-capturing extension.
		if err = chromeCaps.AddExtension("other/proxy.zip"); err != nil {
			pauseBeforeRetry(err)
			continue
		}
		if err = chromeCaps.AddExtension("other/ajax.zip"); err != nil {
			pauseBeforeRetry("ajax:", err)
			continue
		}

		// Launch Chrome with the assembled capabilities.
		caps := selenium.Capabilities{"browserName": "chrome"}
		caps.AddChrome(chromeCaps)
		if wd, err = selenium.NewRemote(caps, fmt.Sprintf("http://localhost:%d/wd/hub", port)); err != nil {
			pauseBeforeRetry(err)
			continue
		}

		return wd, err
	}
}
*注意
1.downloadPath写法,windows的绝对路径。
2.打开F12模式才能下载,即args添加"--auto-open-devtools-for-tabs"。
3.注意判断文件是否下载成功:谷歌浏览器下载未完成时的临时文件扩展名为.crdownload(.crx是扩展程序安装包的格式)。