话不多说,直接上代码。目前代码只实现了把图片的 src 解析出来,并没有实现真正的下载;其中图片地址的解析单独封装成函数 parseImageUrl。运行后,首先获取文章列表,然后每隔 5 秒请求列表中的一个页面,并解析出该页面中的所有图片地址。
var request = require('request');
var cheerio = require('cheerio');
// Articles discovered on the list page; each entry is { href, text } and is
// later extended with a `data` object by getContent.
const artlist = [];
// Site origin, used to build the list-page URL and to resolve image URLs.
const prefixurl = "https://www.jb51.net";
// Absolute URL of the list page that seeds the crawl.
const url = prefixurl + "/list/list_15_1.htm";

// Fetch the list page, collect every article link inside div.artlist, then
// hand the collected list to getContent.
request({
  url: url,
  method: "GET",
}, (errRequest, resultStatus, htmlstr) => {
  if (errRequest || resultStatus.statusCode !== 200) {
    // resultStatus is undefined when the request itself failed, so guard it;
    // the status code makes non-200 responses diagnosable (errRequest is null then).
    console.log("err-1", errRequest, resultStatus ? resultStatus.statusCode : undefined);
    return;
  }

  const $ = cheerio.load(htmlstr);
  $("div.artlist").find("dt > a").each((index, element) => {
    const href = $(element).attr("href");
    if (!href) {
      // An anchor without href has no page to fetch.
      return;
    }
    let absoluteHref;
    try {
      // Resolve against the list page so relative, root-relative and
      // already-absolute hrefs all produce a valid absolute URL.
      absoluteHref = new URL(href, url).href;
    } catch (err) {
      console.log("err-1", "invalid href skipped:", href);
      return;
    }
    artlist.push({
      href: absoluteHref,
      text: $(element).text(),
    });
  });

  // getContent is async: surface a rejection instead of leaving it unhandled.
  getContent(artlist).catch((err) => {
    console.log("err-1", err);
  });
});
/**
 * Fetch every article in `artlist`, one request every 5 seconds, and extract
 * its title, cleaned content HTML and image URLs.
 *
 * Each successfully fetched entry gets a `data` object with `title` and
 * `content`; the resolved image URLs are only logged (nothing is downloaded).
 * The loop waits between requests but does not wait for a response, so
 * responses are handled whenever their callback fires.
 *
 * @param {Array<{href: string, text: string}>} artlist - Articles to fetch;
 *   entries are mutated in place.
 * @returns {Promise<void>} Resolves once the last request has been issued.
 */
async function getContent(artlist) {
  for (let i = 0; i < artlist.length; i++) {
    // Throttle: pause before every request, including the first one.
    await sleep(5000);
    const article = artlist[i];
    console.log("正在处理:" + (i + 1) + "/" + artlist.length +"\t", article.href, "\t", article.text);
    request({
      url: article.href,
      method: "GET"
    }, (errRequest, resultStatus, htmlstr) => {
      if (errRequest || resultStatus.statusCode !== 200) {
        // resultStatus is undefined when the request itself failed.
        console.log("err-2", i, errRequest, resultStatus ? resultStatus.statusCode : undefined);
        return;
      }

      const $ = cheerio.load(htmlstr);
      article.data = {};
      // Article title
      article.data.title = $("#article > h1").text();
      // Article content: drop the .art_xg block and the <p> right before it
      $("#content .art_xg").prev("p").remove();
      $("#content .art_xg").remove();
      article.data.content = $("#content").html();

      // Article images. Relative srcs must be resolved against the path of
      // the article that was just fetched, not against the list page.
      const pagePath = new URL(article.href).pathname;
      const imagelist = [];
      $("#content img").each((index, element) => {
        const src = $(element).attr("src");
        if (!src) {
          // <img> without a src has nothing to resolve.
          return;
        }
        imagelist.push(parseImageUrl(src, prefixurl, pagePath));
      });
      console.log(imagelist);
    });
  }
}
/**
 * Create a promise that settles after a delay, for use with `await`.
 *
 * @param {number} ms - Delay in milliseconds, passed to setTimeout.
 * @returns {Promise<undefined>} Fulfils with no value once the timer fires.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
/*
var prefix = "https://www.baidu.com";
var url1 = "/pathname/1";
var url2 = "/pathname/1?params=aaa/bbb&others";
var src1 = "https://www.baidu.com/pathname/xxx/sss.jpg?others/something";
var src2 = "xxx/sss.jpg?others/something";
var src3 = "/xxx/sss.jpg?others/something";
var src4 = "//xxx/sss.jpg?others/something";
console.log(parseImageUrl(src4, prefix, url2));
*/
/**
 * Turn the `src` of an <img> into an absolute URL.
 *
 * Handles four shapes of `src`:
 *   - absolute ("https://host/a.jpg")   -> returned as normalised by URL
 *   - protocol-relative ("//host/a.jpg") -> scheme of `prefix` is prepended
 *   - root-relative ("/a.jpg")           -> appended to `prefix`
 *   - relative ("img/a.jpg")             -> appended to the directory of `url`
 *
 * @param {string} src - Raw value of the src attribute.
 * @param {string} prefix - Site prefix such as "https://www.example.com";
 *   one trailing slash is tolerated.
 * @param {string} url - Page the image appears on, as a path
 *   ("/dir/page?query"); an absolute page URL is reduced to its path.
 * @returns {string} The absolute image URL, or "" when `src` is missing or blank.
 */
function parseImageUrl(src, prefix, url) {
  // A missing or blank src has nothing to resolve.
  if (typeof src !== "string" || src.trim() === "") {
    return "";
  }
  const source = src.trim();
  // Drop one trailing slash so the concatenations below never produce "//".
  const base = prefix.endsWith("/") ? prefix.slice(0, -1) : prefix;

  try {
    // Succeeds only for an absolute URL (it has its own scheme).
    return new URL(source).href;
  } catch (err) {
    // Not absolute: resolved by the cases below.
  }

  if (source.startsWith("//")) {
    // Protocol-relative: reuse the scheme of the site prefix.
    return base.substring(0, base.indexOf("://")) + ":" + source;
  }
  if (source.startsWith("/")) {
    // Root-relative.
    return base + source;
  }

  // Relative: resolve against the directory of the page path.
  let pagePath = typeof url === "string" ? url : "";
  try {
    // An absolute page URL contributes only its path.
    pagePath = new URL(pagePath).pathname;
  } catch (err) {
    // Already a path: strip the query string and fragment.
    pagePath = pagePath.split(/[?#]/, 1)[0];
  }
  if (!pagePath.startsWith("/")) {
    pagePath = "/" + pagePath;
  }
  const directory = pagePath.slice(0, pagePath.lastIndexOf("/"));
  return base + directory + "/" + source;
}