With some spare time on my hands, I decided to try crawling a few novels; many articles by fellow bloggers here on cnblogs were a great source of inspiration.
My approach: take the page URL ----> fetch the HTML from the back end ----> analyze the structure of the data I need ----> extract the target information.
Along the way, some manipulation of the HTML is unavoidable.
There are many ways to do that; for details, see this earlier article: https://www.cnblogs.com/cang12138/p/7464226.html?utm_source=debugrun&utm_medium=referral
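To make that pipeline concrete, here is a minimal NSoup sketch (the URL is one of the catalog pages used later; error handling omitted):
using NSoup;
using NSoup.Nodes;
using NSoup.Select;
class PipelineSketch
{
static void Main()
{
//Fetch the page and parse it into a DOM in one step.
Document doc = NSoupClient.Connect("https://www.ddxsku.com/files/article/html/23/23024/index.html").Get();
//Locate the structure we need and extract the data, here the book's title.
Elements titles = doc.GetElementsByTag("title");
System.Console.WriteLine(titles.Text);
}
}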
And here is the full code I tested myself, posted for the record.
using NSoup;
using NSoup.Nodes;
using NSoup.Select;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web.Mvc;
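//NSoup is a .NET port of the Jsoup HTML parser; both NSoup and HtmlAgilityPack can be installed as NuGet packages.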
namespace PaChongDemo.Controllers
{
public class HomeController : Controller
{
//The set of catalog-page URLs to crawl
string[] urlArray = new string[] { "https://www.ddxsku.com/files/article/html/23/23024/index.html", "https://www.ddxsku.com/files/article/html/2/2739/index.html" };
public ActionResult Index()
{
foreach (var item in urlArray)
{
NSoup(item);
}
return View();
}
/// <summary>
/// HTTP helper. The commented-out block below is a GET implementation; the active code
/// issues a POST. It is not called by the NSoup crawl and is kept here for reference.
/// </summary>
/// <param name="Url">target URL</param>
/// <param name="postDataStr">query string (GET) or form body (POST)</param>
/// <returns>response body as a string</returns>
public string HttpGet(string Url, string postDataStr)
{
// HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url + (postDataStr == "" ? "" : "?") + postDataStr);
// request.Method = "GET";
// request.CookieContainer = new CookieContainer();
// request.Accept = "*/*";
// request.ServicePoint.Expect100Continue = false;
// //request.Timeout = 30000; //connection timeout
// //request.Headers.Set("Pragma", "no-cache");
// request.UserAgent = "Mozilla-Firefox-Spider(Wenanry)";
// request.Headers.Add("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
// request.ContentType = "text/html;charset=UTF-8";
// HttpWebResponse response;
// try
// {
// response = (HttpWebResponse)request.GetResponse();
// }
// catch (WebException ex)
// {
// //On failure, read the error response instead of retrying the same request.
// response = (HttpWebResponse)ex.Response;
// }
// Stream myResponseStream = response.GetResponseStream();
// StreamReader myStreamReader = new StreamReader(myResponseStream, Encoding.GetEncoding("utf-8"));
// string retString = myStreamReader.ReadToEnd();
// myStreamReader.Close();
// myResponseStream.Close();
// return retString;
CookieContainer cookie = new CookieContainer();
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
request.Method = "POST";
request.ContentType = "application/x-www-form-urlencoded";
//The body must be measured and written with the same encoding, otherwise ContentLength is wrong for non-ASCII data.
byte[] postData = Encoding.UTF8.GetBytes(postDataStr);
request.ContentLength = postData.Length;
request.CookieContainer = cookie;
Stream myRequestStream = request.GetRequestStream();
myRequestStream.Write(postData, 0, postData.Length);
myRequestStream.Close();
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
response.Cookies = cookie.GetCookies(response.ResponseUri);
Stream myResponseStream = response.GetResponseStream();
StreamReader myStreamReader = new StreamReader(myResponseStream, Encoding.GetEncoding("utf-8"));
string retString = myStreamReader.ReadToEnd();
myStreamReader.Close();
myResponseStream.Close();
return retString;
}
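//Example call (hypothetical endpoint and form data, for illustration only):
//string html = HttpGet("https://example.com/search", "keyword=test");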
/// <summary>
/// Write one chapter to a text file, creating the directory and file as needed
/// </summary>
/// <param name="content">chapter content</param>
/// <param name="name">file name (the sanitized chapter title)</param>
/// <param name="path">target directory</param>
public void Novel(string content, string name, string path)
{
string Log = content + "\r\n";
string file = Path.Combine(path, name + ".txt");
//Create the target directory if it does not exist yet
if (!Directory.Exists(path))
{
Directory.CreateDirectory(path);
}
//Create the file on the first write, append on later writes
if (!System.IO.File.Exists(file))
{
FileStream fs1 = new FileStream(file, FileMode.Create, FileAccess.Write);
StreamWriter sw = new StreamWriter(fs1);
sw.WriteLine(Log);
sw.Close();
fs1.Close();
}
else
{
FileStream fs = new FileStream(file, FileMode.Append, FileAccess.Write);
StreamWriter sr = new StreamWriter(fs);
sr.WriteLine(Log);
sr.Close();
fs.Close();
}
}
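//Note: System.IO.File.AppendAllText(file, Log) creates the file when it is missing and
//appends otherwise, so both branches above could be collapsed into a single call.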
/// <summary>
/// Alternative parser using HtmlAgilityPack (kept commented out)
/// </summary>
/// <param name="Url"></param>
//public void HtmlagilityPack(string Url = "")
//{
// //Requires: using HtmlAgilityPack;
// HtmlWeb webClient = new HtmlWeb();
// webClient.OverrideEncoding = Encoding.GetEncoding("utf-8");//set the encoding explicitly; many snippets found online get this wrong
// HtmlDocument doc = webClient.Load(Url);
// //XPath: any element whose class attribute equals article_texttitleb
// HtmlNodeCollection anchors = doc.DocumentNode.SelectNodes("//*[@class='article_texttitleb']");
// string sss = "";
// foreach (var htmlNode in anchors)
// {
// sss += htmlNode.InnerHtml;
// }
//}
/// <summary>
/// Crawl a book's catalog page and every chapter with the NSoup HTML parser
/// </summary>
/// <param name="Url"></param>
public void NSoup(string Url = "")
{
Document doc = NSoupClient.Connect(Url).Get();
Elements titles = doc.GetElementsByTag("title");//the book title, used as the folder name
string path = Server.MapPath("/Content/" + titles.Text);
Elements cataLog = doc.GetElementsByClass("at");//the chapter catalog
Document docChild = NSoupClient.Parse(cataLog.ToString());
Elements eleChild = docChild.GetElementsByTag("a");//one <a> per chapter
foreach (var item in eleChild)
{
string title = item.Text();//chapter title
string htmlChildUrl = item.Attr("href").Trim();//chapter URL (absolute on this site)
Document docTwo = NSoupClient.Connect(htmlChildUrl).Get();
Element conTent = docTwo.GetElementById("contents");//the chapter body container
string txtContent = conTent.Text();
Novel(txtContent, KillBadChar(title), path);
}
}
/// <summary>
/// Strip characters that are illegal in file names, so the chapter title can be used as one
/// </summary>
/// <param name="charStr"></param>
/// <returns></returns>
public string KillBadChar(string charStr)
{
string reg = @"[:;/\\|,*?""<>]";//characters that break file names (the Windows-reserved set plus ';' and ',') as one character class
Regex r = new Regex(reg);
return r.Replace(charStr, "");//replace each with the empty string
}
}
}
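To run this: install the NSoup package from NuGet (and HtmlAgilityPack if you want to try the commented-out variant), then browse to /Home/Index. Each book ends up under /Content/<book title>/ with one .txt file per chapter. One caveat: the crawl runs synchronously inside the controller action, so a long catalog will block the request until every chapter has been fetched.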