项目组有个网络爬虫的需求,Java有个爬页面的Jsoup,虽然可以借助cookie,sessionId,TCP/IP通信等相关实现一定程度上的模拟浏览器行为,但作为服务器开发语言,在前端交互上,显然不能挥洒自如.老大推荐了CasperJs,同事在搞,顺便也偷瞄的学习下.
CasperJs 是一个开源的导航脚本处理和测试工具,基于PhantomJS(前端自动化测试工具)编写.CasperJS简化了完整的导航场景的过程定义.提供了用于完成常见任务的实用的高级函数,方法和语法.
Phantomjs 是一个基于webkit内核的无头浏览器,提供Headless Testing,Screen Capture,Page Automation,Network Monitoring 等解决方案.
Phantomjs 提供javascript API,通过js就可以实现与Phantomjs交互,而CasperJs 是用js开发的脚本工具,其功能的实现依赖Phantomjs API.因此利用CasperJs 进行爬虫,其核心还是Phantomjs.
// Installation smoke test: open two pages in sequence and print each title.
var casper = require('casper').create();

casper
    .start('http://casperjs.org/')
    .then(function () {
        // Runs once the start URL has loaded.
        var firstTitle = this.getTitle();
        this.echo('First Page: ' + firstTitle);
    })
    .thenOpen('http://phantomjs.org', function () {
        // Runs once the second URL has loaded.
        var secondTitle = this.getTitle();
        this.echo('Second Page: ' + secondTitle);
    })
    .run();
在sample.js的目录下执行cmd命令 casperjs sample.js,即可看到返回值
First Page: CasperJS - a navigation scripting & testing utility for PhantomJS and SlimerJS written in Javascript
Second Page: PhantomJS | PhantomJS
至此,安装成功.
// Script setup: output encoding, module handles and the configured casper instance.
phantom.outputEncoding = "GBK"; // encoding used for console output
var fs = require('fs'); // module provided by PhantomJS
var utils = require('utils'); // CasperJS's own utility module
var casper = require('casper').create({
    verbose: true,
    silentErrors: true,
    //logLevel: "info", // log level
    viewportSize: {width: 800, height: 600},
    clientScripts: [ // custom js files to inject; add your own here
        'jquery-3.2.1.min.js' // injected into the page, so usable inside evaluate()
        //'polyfill.min.js'
    ],
    pageSettings: {
        loadImages: true, // load images
        loadPlugins: true, // load plugins
        XSSAuditingEnabled: false,
        localToRemoteUrlAccessEnabled: false,
        //userAgent: 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36' // spoofed user agent
        userAgent: 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0' // spoofed user agent
    },
    /**
     * Prints an error message and, when one is supplied, every backtrace entry.
     *
     * @param {Casper} casper    the casper instance
     * @param {string} msg       the error message
     * @param {*}      backtrace list of trace entries when it is an array;
     *                           any other non-null value is echoed as-is
     */
    onError: function (casper, msg, backtrace) {
        // echo()'s second parameter is a style, so the message has to be
        // concatenated into the text instead of passed separately.
        casper.echo("onError msg: " + msg);
        if (Array.isArray(backtrace)) {
            backtrace.forEach(function (frame) {
                // Use the casper parameter rather than `this`, which is not
                // bound inside a forEach callback.
                var text = typeof frame === "string" ? frame : JSON.stringify(frame);
                casper.echo("onError backtrace: " + text);
            });
        } else if (backtrace !== undefined && backtrace !== null) {
            casper.echo("onError backtrace: " + backtrace);
        }
    },
    /**
     * Prints the URL that failed to load together with the reported status.
     */
    onLoadError: function (casper, requestUrl, sta) {
        casper.echo("onLoadError requestUrl: " + requestUrl);
        casper.echo("onLoadError status :" + sta);
    },
    /**
     * Filters outgoing requests, e.g. skips unrelated adobe URLs.
     * A request is aborted when its URL contains one of the skip fragments.
     */
    onResourceRequested: function (casper, resource, request) {
        var skip = [
            'adobe.com',
            'web.yixin'
        ];
        // some() stops at the first hit, so a request is aborted at most once.
        skip.some(function (needle) {
            // -1 means "not found"; a match at index 0 counts as well.
            if (resource.url.indexOf(needle) === -1) {
                return false;
            }
            casper.echo('abort: ' + needle);
            request.abort();
            return true;
        });
    }
});
// Credentials used by the login step (masked here).
var email = "****"; // 163 mailbox account
var password = "*****"; // account password
// Address typed into the recipient field of the test mail.
var sendEmail = "****";

// No need for webPage.create(): once casperjs starts it creates a page on its
// own, reachable through casper.page (the `page` option defaults to empty).
casper.start("http://mail.163.com/", function () {
    var loadedAt = new Date();
    this.echo("loaded : " + loadedAt);

    // Forward console.log output of the page to the script's console.
    this.page.onConsoleMessage = function (text) {
        console.log(text);
    };
});
// Login flow followed by an (experimental) attempt to compose and send a mail.
// Steps: wait for the login iframe -> fill and submit the login form ->
// verify the greeting element -> open the compose page -> fill and send.
casper.waitForSelector("#x-URS-iframe", function () {
    // The login form lives inside the #x-URS-iframe iframe.
    // NOTE(review): the frame is addressed by position (3); confirm the index
    // if the page layout changes.
    this.switchToFrame(3); // enter the login iframe
    this.echo("frameName " + this.page.frameName);
    this.waitForSelector("form#login-form", function () {
        // Fill the login form without submitting it (third argument false).
        this.fill('form#login-form', {
            'email': email,
            'password': password
        }, false);
        // Click the login button.
        this.click("#dologin");
        this.wait(2000, function () {
            // The greeting element carries the logged-in user name.
            var greetSelector = "span[id$='_dvGreetName']";
            if (!this.exists(greetSelector)) {
                // Without a session the compose button below does not exist,
                // so stop here like the other failure paths do.
                this.echo("login fail").exit();
                return;
            }
            this.echo("login success username: " + this.fetchText(greetSelector));
            /* jQuery usage inside evaluate() (runs in the page context):
            var flag=this.evaluate(function(){
                console.log("login success username: " + $("span[id$='_dvGreetName']").html());
                return true;
            }); */

            // Click "compose mail".
            this.click("li.js-component-component.ra0.mD0 > span.oz0");
            this.wait(2000, function () {
                // What follows is a test of writing a mail; it did not succeed.
                this.wait(2000, function () {
                    casper.waitForSelector("div[id^='_mail_toolbar_']", function () {
                        // The compose page is open.
                        this.echo("access writer letter page");
                        this.wait(2000, function () {
                            this.capture("1.png");
                            this.sendKeys("div[id^='_mail_input_'] > input[id$='_subjectInput']", 'hello'); // mail subject
                            // Filling in the recipient is where it fails: after an address is typed,
                            // the page runs a bound event handler that validates the format and wraps
                            // the address in an <em> tag; casperjs could not reproduce that behaviour.
                            this.sendKeys("input.nui-editableAddr-ipt", sendEmail); // recipient (qq mailbox)
                            // Type the mail body inside the frame at position 4.
                            this.switchToFrame(4);
                            this.sendKeys("body.nui-scroll", '123');
                            this.switchToMainFrame();
                            //this.switchToParentFrame();
                            this.echo(this.exists("div[id^='_mail_input_'] > input[id$='_subjectInput']"));
                            this.echo(this.exists("input.nui-editableAddr-ipt"));
                            this.echo("frameName " + this.page.frameName);
                            this.wait(3000, function () {
                                this.capture("2.png");
                                // Click the send button.
                                this.click("div.nui-toolbar-item > div.js-component-button.nui-mainBtn.nui-btn.nui-btn-hasIcon.nui-mainBtn-hasIcon > span.nui-btn-text");
                                this.wait(3000, function () {
                                    this.capture("3.png");
                                });
                            });
                        });
                    }, function () {
                        // The compose toolbar did not show up within 10 s.
                        this.echo("access writer letter page fail").exit();
                    }, 10000);
                });
            });
        });
    }, function () {
        // The login form did not show up within 10 s.
        this.echo("form#login-form not found").exit();
    }, 10000);
});
// Execute the queued steps; when they are all done, print "end" and quit.
casper.run(function () {
    this.echo("end");
    this.exit();
});