当前位置: 首页 > 工具软件 > cheerio > 使用案例 >

用nodejs的request和cheerio模块爬取网页和图片

燕雨石
2023-12-01

话不多说，直接上代码。代码目前只做到把图片的 src 解析出来，并没有实现真正的下载；其中图片地址的解析单独做成了一个函数 parseImageUrl。运行后，程序首先获取网页列表，然后每隔 5 秒获取列表中的一个页面，同时解析出该页面中的所有图片地址。

var request = require('request');
var cheerio = require('cheerio');

// Collected article links from the list page; each entry is { href, text }.
const artlist = [];
// Site origin used to turn the hrefs found on the list page into absolute URLs.
const prefixurl = "https://www.jb51.net";
// The list page that is fetched first.
const url = prefixurl + "/list/list_15_1.htm";

// Fetch the list page, extract every article link, then hand the list to getContent().
request({
    url: url,
    method: "GET",
}, (errRequest, resultStatus, htmlstr) => {
    if (errRequest || resultStatus.statusCode !== 200) {
        // On a non-200 response errRequest is null, so report the status code instead.
        console.log("err-1", errRequest || ("HTTP " + resultStatus.statusCode));
        return;
    }

    const $ = cheerio.load(htmlstr);
    $("div.artlist").find("dt > a").each(function(index, element) {
        const href = element.attribs["href"];
        if (!href) {
            // An anchor without an href cannot be fetched later; skip it.
            return;
        }

        try {
            artlist.push({
                // Standard URL resolution handles both site-relative and absolute hrefs.
                href: new URL(href, prefixurl).href,
                text: $(element).text(),
            });
        } catch (err) {
            console.log("err-1", "skipping unparsable href", href);
        }
    });

    // getContent is async: attach a handler so a failure is reported
    // instead of surfacing as an unhandled promise rejection.
    getContent(artlist).catch((err) => {
        console.log("err-1", err);
    });
});

/**
 * Fetch every article in the list, one request every 5 seconds, and parse
 * its title, content HTML and image URLs.
 *
 * The parsed title and content are stored on `artlist[i].data`; the image
 * URLs are only printed to the console. Each request is started but not
 * awaited, so responses are handled whenever their callback fires.
 *
 * @param {Array<{href: string, text: string}>} artlist - Articles to fetch;
 *     each entry gains a `data` object once its page has been parsed.
 * @returns {Promise<void>} Resolves after the last request has been started.
 */
async function getContent(artlist) {

    for (let i = 0; i < artlist.length; i++) {

        // Throttle: wait before each request so pages are fetched 5 seconds apart.
        await sleep(5000);

        console.log("正在处理:" + (i + 1) + "/" + artlist.length +"\t", artlist[i].href, "\t", artlist[i].text);
        request({
            url: artlist[i].href,
            method: "GET"
        }, (errRequest, resultStatus, htmlstr) => {
            if (!errRequest && resultStatus.statusCode == 200) {
                var $ = cheerio.load(htmlstr);
                artlist[i].data = {};

                // Article title
                artlist[i].data.title = $("#article > h1").text();

                // Article content: drop the ".art_xg" block and the paragraph
                // right before it, then keep the remaining HTML.
                $("#content .art_xg").prev("p").remove();
                $("#content .art_xg").remove();
                artlist[i].data.content = $("#content").html();

                // Relative image paths must be resolved against THIS article's
                // path, not against the list page URL held in the global `url`.
                var pagePath = new URL(artlist[i].href).pathname;

                // Article images
                var imagelist = [];
                $("#content img").each((index, element) => {
                    var src = $(element).attr("src");
                    if (!src) {
                        // An <img> without a src has nothing to resolve.
                        return;
                    }
                    imagelist.push(parseImageUrl(src, prefixurl, pagePath));
                });
                
                // console.log(artlist[i]);
                console.log(imagelist);
            } else {
                console.log("err-2", i, errRequest);
            }
        });
        
    }
    
}

/**
 * Return a promise that settles after the given delay.
 *
 * @param {number} ms - Delay in milliseconds, passed straight to setTimeout.
 * @returns {Promise<undefined>} Resolves (with no value) once the timer fires.
 */
function sleep(ms) {
    return new Promise(function (done) {
        setTimeout(done, ms);
    });
}

/*
var prefix = "https://www.baidu.com";
var url1 = "/pathname/1";
var url2 = "/pathname/1?params=aaa/bbb&others";

var src1 = "https://www.baidu.com/pathname/xxx/sss.jpg?others/something";
var src2 = "xxx/sss.jpg?others/something";
var src3 = "/xxx/sss.jpg?others/something";
var src4 = "//xxx/sss.jpg?others/something";

console.log(parseImageUrl(src4, prefix, url2));
*/
/**
 * Resolve the `src` of an image to an absolute URL.
 *
 * Handles the four shapes a src can take:
 *   - absolute           ("https://host/a.jpg")  -> returned as-is (normalized)
 *   - relative           ("img/a.jpg", "../a.jpg") -> resolved against the page's directory
 *   - root-relative      ("/img/a.jpg")          -> resolved against the site origin
 *   - protocol-relative  ("//cdn/a.jpg")         -> inherits the scheme of `prefix`
 *
 * @param {string} src - The raw src attribute value.
 * @param {string} prefix - Site origin, e.g. "https://www.example.com"
 *     (a trailing slash is tolerated).
 * @param {string} url - Location of the page containing the image: a path
 *     such as "/dir/page?x=1", or a full URL.
 * @returns {string} The absolute image URL, or "" when `src` is missing or
 *     cannot be resolved.
 */
function parseImageUrl(src, prefix, url) {

    // An <img> without a src attribute yields undefined; nothing to resolve.
    if (typeof src !== "string" || src.trim() === "") {
        return "";
    }

    // Build the page's absolute URL first. Standard resolution takes care of a
    // trailing slash on prefix, query strings, fragments and full-URL inputs.
    let base;
    try {
        base = new URL(url || "", prefix);
    } catch (err) {
        // prefix is not a usable origin; only an already-absolute src can
        // still be resolved below.
        base = undefined;
    }

    try {
        return new URL(src, base).href;
    } catch (err) {
        return "";
    }
}

 类似资料: