https://node-crawler.readthedocs.io/zh_CN/latest/
爬虫无非就是分析网页、分析接口，取得你想要的数据。取得数据有两种方式：
var express = require("express");
var superagent = require("superagent"); // HTTP client used to fetch the upstream page
var charset = require("superagent-charset"); // adds .charset() so non-UTF-8 pages can be decoded
var cheerio = require("cheerio"); // jQuery-like HTML parser for the server

// Install the charset plugin once at module load instead of on every request.
charset(superagent);

var router = express.Router();

// Site that is scraped and the page encoding requested when decoding it.
var BASE_URL = "https://www.qqtn.com/";
var PAGE_ENCODING = "gb2312";

// Query values are spliced into the upstream URL path, so only accept plain
// path-safe tokens: no "/", ".", "?", "#" or "%" that could redirect the request.
// NOTE(review): assumes upstream category names are plain alphanumerics
// (e.g. "weixin") - confirm against the site's URL scheme.
var TYPE_PATTERN = /^[A-Za-z0-9_-]+$/;
var PAGE_PATTERN = /^[0-9]+$/;

/**
 * Sends the JSON envelope used by every response of this route.
 *
 * @param {object} res - Express response object.
 * @param {number} code - Application-level status code placed in the body.
 * @param {string} msg - Short status message.
 * @param {Array} items - Scraped entries (empty on failure).
 */
function sendResult(res, code, msg, items) {
  res.json({
    code: code,
    msg: msg,
    data: items
  });
}

/**
 * GET / - fetches one listing page from the upstream site and returns the
 * links found in it as JSON.
 *
 * Query parameters:
 *   type - category token used in the upstream path (default "weixin").
 *   page - page number used in the upstream path (default "1").
 *
 * Response body: { code, msg, data } where data is an array of
 * { title, href, thumbSrc } objects. code is 200 on success and 400 when the
 * query is invalid or the upstream request fails.
 */
router.get("/", function(req, res, next) {
  // CORS headers so the endpoint can be called from any origin.
  res.header("Access-Control-Allow-Origin", "*");
  res.header("Access-Control-Allow-Headers", "X-Requested-With, Content-Type");
  res.header("Access-Control-Allow-Methods", "PUT, GET, POST, DELETE, OPTIONS");

  // Category and page number, falling back to the defaults when absent or empty.
  var type = req.query.type || "weixin";
  var page = req.query.page || "1";

  // req.query values are user-controlled and may be arrays or objects; reject
  // anything that is not a simple token before it reaches the upstream URL.
  // The payload is the same one the upstream-failure path below returns.
  if (
    typeof type !== "string" ||
    typeof page !== "string" ||
    !TYPE_PATTERN.test(type) ||
    !PAGE_PATTERN.test(page)
  ) {
    sendResult(res, 400, "err", []);
    return;
  }

  // Upstream address of the requested listing page.
  var route = "tx/" + type + "tx_" + page + ".html";

  superagent
    .get(BASE_URL + route)
    .charset(PAGE_ENCODING)
    .end(function(err, sres) {
      var items = [];
      if (err) {
        console.log("err", err);
        sendResult(res, 400, "err", items);
        return;
      }

      // Parse the fetched HTML.
      var $ = cheerio.load(sres.text);

      // Each matching anchor yields one entry; the thumbnail comes from its <img>.
      $("div.g-main-bg ul.g-select-img li a").each(function(idx, element) {
        var $element = $(element);
        items.push({
          title: $element.attr("title"),
          href: $element.attr("href"),
          thumbSrc: $element.find("img").attr("src")
        });
      });

      sendResult(res, 200, "success", items);
    });
});

module.exports = router;