当前位置: 首页 > 工具软件 > NSoup > 使用案例 >

C# 使用 HTML 解析器 NSoup 爬取小说

管炳
2023-12-01

闲着没事,想试一试爬取一些小说,看了园子里很多前辈写的文章,很受启发。

说下我的思路:查看文章网页链接 → 后台远程抓取 HTML 代码 → 分析所需数据结构 → 提取所需信息

在这其中则免不了对html的一些操作。

方法很多种,具体移步前辈文章:https://www.cnblogs.com/cang12138/p/7464226.html?utm_source=debugrun&utm_medium=referral

在这里我贴出我自己测试过的代码,以此记录一下

using NSoup;
using NSoup.Nodes;
using NSoup.Select;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web.Mvc;

namespace PaChongDemo.Controllers
{
    public class HomeController : Controller
    {
        // Index (table-of-contents) pages of the novels to crawl.
        private readonly string[] urlArray =
        {
            "https://www.ddxsku.com/files/article/html/23/23024/index.html",
            "https://www.ddxsku.com/files/article/html/2/2739/index.html"
        };

        /// <summary>
        /// Runs the NSoup crawl for every address in <c>urlArray</c>, in order,
        /// and then returns the default view for this action.
        /// </summary>
        /// <returns>The result of <c>View()</c>.</returns>
        public ActionResult Index()
        {
            for (int i = 0; i < urlArray.Length; i++)
            {
                NSoup(urlArray[i]);
            }

            return View();
        }

        /// <summary>
        /// Sends <paramref name="postDataStr"/> to <paramref name="Url"/> as a
        /// form-encoded request and returns the response body decoded as UTF-8.
        /// </summary>
        /// <remarks>
        /// Despite its name, this method issues an HTTP POST
        /// (<c>application/x-www-form-urlencoded</c>); the name is kept so that
        /// existing callers keep compiling.
        /// </remarks>
        /// <param name="Url">Absolute address to send the request to.</param>
        /// <param name="postDataStr">
        /// Already URL-encoded form data; <c>null</c> is treated as an empty body.
        /// </param>
        /// <returns>The full response body as a string.</returns>
        public string HttpGet(string Url, string postDataStr)
        {
            // Encode the body exactly once so that Content-Length always matches the
            // bytes actually written (the length used to be counted in UTF-8 while the
            // body was written as gb2312, which disagree for non-ASCII data).
            byte[] body = Encoding.GetEncoding("gb2312").GetBytes(postDataStr ?? string.Empty);

            CookieContainer cookie = new CookieContainer();
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            request.ContentLength = body.Length;
            request.CookieContainer = cookie;

            using (Stream requestStream = request.GetRequestStream())
            {
                requestStream.Write(body, 0, body.Length);
            }

            // Dispose the response and its stream even when reading throws, so the
            // underlying connection is always released.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.GetEncoding("utf-8")))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Appends <paramref name="content"/> to the text file
        /// <c>name.txt</c> inside <paramref name="path"/>, creating the directory
        /// and the file first when they do not exist yet.
        /// </summary>
        /// <param name="content">Text to append.</param>
        /// <param name="name">File name without the <c>.txt</c> extension.</param>
        /// <param name="path">Directory that holds the file.</param>
        public void Novel(string content, string name, string path)
        {
            // CreateDirectory does nothing when the directory already exists.
            Directory.CreateDirectory(path);

            // Build the path once; the append branch used to omit the separator and
            // therefore wrote to a different file than the one that was created.
            string filePath = Path.Combine(path, name + ".txt");

            // Same bytes as before: the content, "\r\n", then one platform line break.
            string entry = content + "\r\n" + System.Environment.NewLine;

            // "File" alone would resolve to Controller.File here, hence the full name.
            // AppendAllText creates the file when missing and always closes it.
            System.IO.File.AppendAllText(filePath, entry);
        }


        // Disabled alternative: parse the page with HtmlAgilityPack instead of NSoup.
        // Takes the page URL as its only parameter. Kept commented out for reference.
        //public void HtmlagilityPack(string Url = "")
        //{
        //    HtmlWeb webClient = new HtmlWeb();
        //    webClient.OverrideEncoding = Encoding.GetEncoding("utf-8");//编码,这里网上有些很多写法都不正确
        //    HtmlDocument doc = webClient.Load(Url);
        //    HtmlNodeCollection anchors = doc.DocumentNode.SelectNodes("//class[@article_texttitleb]");

        //    string sss = "";
        //    foreach (var htmlNode in anchors)
        //    {
        //        int indexnum = anchors.IndexOf(htmlNode);
        //        sss += htmlNode.InnerHtml;
        //    }
        //}

        /// <summary>
        /// Crawls one novel with the NSoup HTML parser: loads the index page at
        /// <paramref name="Url"/>, collects the links found inside elements with
        /// class <c>at</c>, downloads each linked chapter page and appends the text
        /// of its <c>#contents</c> element to a file via <c>Novel</c>.
        /// </summary>
        /// <param name="Url">Absolute address of the novel's index page.</param>
        public void NSoup(string Url = "")
        {
            Document doc = NSoupClient.Connect(Url).Get();

            // The page title names the output folder under /Content. Strip characters
            // that are not allowed in file-system names, as is done for chapter titles.
            Elements titles = doc.GetElementsByTag("title");
            string path = Server.MapPath("/Content/" + KillBadChar(titles.Text));

            // The table of contents lives in the elements with class "at".
            Elements cataLog = doc.GetElementsByClass("at");
            Document docChild = NSoupClient.Parse(cataLog.ToString());
            Elements chapterLinks = docChild.GetElementsByTag("a");

            // Base address used to resolve relative chapter links.
            System.Uri indexUri = new System.Uri(Url);

            foreach (var link in chapterLinks)
            {
                string href = link.Attr("href").Trim();

                // Skip anchors without a usable http(s) target instead of crashing.
                System.Uri chapterUri;
                if (href.Length == 0 || !System.Uri.TryCreate(indexUri, href, out chapterUri))
                {
                    continue;
                }
                if (chapterUri.Scheme != System.Uri.UriSchemeHttp && chapterUri.Scheme != System.Uri.UriSchemeHttps)
                {
                    continue;
                }

                Document chapterDoc = NSoupClient.Connect(chapterUri.AbsoluteUri).Get();

                // A page without the "contents" element has no chapter text to save.
                Element content = chapterDoc.GetElementById("contents");
                if (content == null)
                {
                    continue;
                }

                string chapterTitle = link.Text();
                Novel(content.Text(), KillBadChar(chapterTitle), path);
            }
        }



        // Characters removed by KillBadChar:  : ; / \ | , * ? " < >
        // Built once and reused instead of constructing a new Regex on every call.
        private static readonly Regex BadCharPattern = new Regex(@"[:;/\\|,*?""<>]");

        /// <summary>
        /// Removes the characters <c>: ; / \ | , * ? " &lt; &gt;</c> from
        /// <paramref name="charStr"/> so that a title can be used as a file name.
        /// </summary>
        /// <param name="charStr">Text to clean; may be <c>null</c> or empty.</param>
        /// <returns>
        /// The text without the listed characters, or an empty string when
        /// <paramref name="charStr"/> is <c>null</c> or empty.
        /// </returns>
        public string KillBadChar(string charStr)
        {
            if (string.IsNullOrEmpty(charStr))
            {
                return string.Empty;
            }

            return BadCharPattern.Replace(charStr, "");
        }
    }
}

 

 类似资料: