【原创.NET】庆祝.NET 7.0正式发布,用C#编写个小说网站通用爬虫程序
本帖最后由 jdclang 于 2022-11-11 12:58 编辑之前一直用Python写爬虫,Python写爬虫也确实方便,不过实在不喜欢,甚至讨厌Python在任务并发上的模式,这次疫情被封控在家,加上.NET 7.0发布,顺手用.NET 7写了个所谓的小说网站通用爬虫(在初始化类的时候需要自己提供不同网站的XPATH),跟Python的代码量比较了一下,其实也没多多少内容。唯一没Python方便的就是HtmlAgilityPack没有BeautifulSoup方便,只支持XPATH方式,不过其实XPATH也挺方便的;www。
代码看着多,其实里面注释占了不少内容,方便各位理解。本来还想加上自动获取代理池的功能,不过后来想想,国内盗版网站好像都没有防爬虫机制,就懒得折腾了,有兴趣的自己修改吧。
using System.Diagnostics.CodeAnalysis;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
using NLog;
using NLog.Targets;
namespace StoryBookSpyder;
public class SpyderEngine
{
    #region Fields and configuration

    // Class-wide NLog logger.
    private static Logger _logger = LogManager.GetCurrentClassLogger();

    // Book state collected while crawling.
    private string _bookName;    // book title, read from the index page
    private string _encoding;    // character encoding of the target site's pages
    private string _baseAddress; // site root, kept to detect alias URLs of the same page
    private string _url;         // fallback page address used when GetHtmlAsync gets an empty url

    /// <summary>Maximum number of chapters downloaded concurrently.</summary>
    public int _Max_Concurrency { get; set; } = 50;

    // XPath selectors for the pieces we scrape; override via the constructor
    // (or the setters) to adapt the engine to a different site's layout.
    /// <summary>
    /// XPath of the book title node.
    /// </summary>
    public string _Xpath_Title { get; set; } = "/html/body/div/h1/a";
    /// <summary>
    /// XPath of the chapter-list entries.
    /// </summary>
    public string _Xpath_Content { get; set; } = "//*[@class='chapter']/li";
    /// <summary>
    /// XPath of the chapter-list pager links.
    /// </summary>
    public string _Xpath_Nextpage { get; set; } = "(//*[@class='page'])/a";
    /// <summary>
    /// XPath of the chapter body nodes.
    /// </summary>
    public string _Xpath_ChapterContent { get; set; } = "(//*[@class='nr_nr'])/div";
    /// <summary>
    /// XPath of the "next page" link inside a chapter.
    /// </summary>
    public string _Xpath_ChapterContent_Nextpage { get; set; } = "(//td[@class='next']//a)";

    // Chapters discovered so far, in page order.
    private List<Catalogue> bookCatalogues = new();

    // One shared HttpClient for the whole process (avoids socket exhaustion).
    static HttpClientHandler handler = new();
    internal static readonly HttpClient httpClient = new(handler);

    #endregion

    #region Engine initialisation
    /// <summary>
    /// Initialises the crawler engine: registers legacy code pages (for GBK etc.),
    /// stores the site root and page encoding, and applies any XPath overrides.
    /// Relative chapter links are resolved against the site root via
    /// <see cref="HttpClient.BaseAddress"/>.
    /// </summary>
    /// <param name="baseAddress">Site root address.</param>
    /// <param name="encoding">Character encoding of the site's pages (e.g. "GBK").</param>
    /// <param name="xpath_Title">Optional override for the title XPath.</param>
    /// <param name="xpath_content">Optional override for the chapter-list XPath.</param>
    /// <param name="xpath_nextpage">Optional override for the pager XPath.</param>
    internal SpyderEngine(string baseAddress, string encoding, string? xpath_Title, string? xpath_content,
        string? xpath_nextpage)
    {
        // GBK/GB2312 and friends are not available on modern .NET without this.
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
        // Accept any TLS certificate — target sites rarely have valid ones.
        handler.ServerCertificateCustomValidationCallback =
            HttpClientHandler.DangerousAcceptAnyServerCertificateValidator;
        _baseAddress = baseAddress;
        httpClient.BaseAddress = new Uri(_baseAddress);
        _encoding = encoding;
        if (xpath_Title is not null)
        {
            _Xpath_Title = xpath_Title;
        }
        if (xpath_content is not null)
        {
            _Xpath_Content = xpath_content;
        }
        if (xpath_nextpage is not null)
        {
            _Xpath_Nextpage = xpath_nextpage;
        }
        // Browser-like default headers so the site serves normal pages.
        httpClient.DefaultRequestHeaders.Clear();
        httpClient.DefaultRequestHeaders.Add("Accept",
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
        httpClient.DefaultRequestHeaders.Add("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6");
        httpClient.DefaultRequestHeaders.Add("Cache-Control", "max-age=0");
        httpClient.DefaultRequestHeaders.Add("Connection", "keep-alive");
        httpClient.DefaultRequestHeaders.Add("Upgrade-Insecure-Requests", "1");
        httpClient.DefaultRequestHeaders.Add("User-Agent",
            "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Mobile Safari/537.36 Edg/107.0.1418.35");
    }
    #endregion

    #region Crawl entry point
    /// <summary>
    /// Runs the whole crawl: collects the chapter list, downloads every chapter
    /// concurrently, cleans the text, and writes the result to
    /// "{BaseDirectory}/{bookName}/{bookName}.txt".
    /// </summary>
    /// <param name="url">Address of the page that contains the chapter list.</param>
    public async Task SpyderBook(string url)
    {
        await GetAllCatalogue(url);
        // Show what was discovered.
        foreach (var bookCatalogue in bookCatalogues)
        {
            Console.WriteLine($"章节名:{bookCatalogue._title},章节地址:{bookCatalogue._catalogueUrl}");
        }
        // Download chapter bodies concurrently. Parallel.ForEachAsync (.NET 6+)
        // awaits the downloads properly instead of blocking pool threads on
        // .Result, which risks thread-pool starvation.
        await Parallel.ForEachAsync(bookCatalogues,
            new ParallelOptions { MaxDegreeOfParallelism = _Max_Concurrency },
            async (catalogue, _) =>
            {
                try
                {
                    catalogue._content = await GetChapterContents(catalogue._catalogueUrl);
                }
                catch (Exception ex)
                {
                    // Log and keep going; one bad chapter must not abort the book.
                    _logger.Error($"出问题了,{ex.Message},出问题的章节是:{catalogue._catalogueUrl}");
                }
            });
        // Stitch chapters together; StringBuilder avoids O(n^2) concatenation.
        var assembled = new StringBuilder();
        foreach (var book in bookCatalogues)
        {
            assembled.Append("\r\n").Append(book._title).Append("\r\n").Append(book._content);
        }
        string result = assembled.ToString();
        // Strip site-specific junk; tune these rules per target site.
        result = RemoveFragments.RemoveFragmentsBetween(result, '(', ')')
            .RemoveFragmentsBetween('(', ')')
            .RemoveFragmentsBetween('【', '】')
            ;
        // Remove <br> markers inside 「…」 dialogue spans. Iterate the matches
        // themselves — the old code read MatchCollection.Value (not a member)
        // and re-replaced the same value every pass.
        Regex regex = new Regex("(?<=「).*?(?=」)");
        foreach (Match dialogue in regex.Matches(result))
        {
            result = result.Replace(dialogue.Value, dialogue.Value.Replace("<br>", ""));
        }
        result = result.Replace(" ", "")
            .Replace("-->>", "")
            .Replace("「「", "」「")
            .Replace("」」", "」「")
            .Replace("第一发布站:xxxx.coм", "")
            .Replace("www.xxxxx.com收藏不迷路!","")
            .Replace("发布地址:<ref=\"http:www.xxxxx.com\"target=\"_blank\">","")
            .Replace("</ref=\"http:>","")
            .Replace("<br><br><br>", "")
            .Replace("<br><br>", "\r\n\r\n ")
            .Replace("<br>", "")
            ;
        // Drop any remaining HTML entities such as &nbsp;.
        Regex regex_again = new Regex("&.*?;");
        result = regex_again.Replace(result, "");
        Console.WriteLine(result);
        Write2Txt(result, Path.Combine(AppDomain.CurrentDomain.BaseDirectory, _bookName, $"{_bookName}.txt"));
    }

    /// <summary>
    /// Collects every chapter into <see cref="bookCatalogues"/>, following the
    /// chapter-list pager recursively until there is no further page.
    /// </summary>
    /// <param name="url">Address of a page containing (part of) the chapter list.</param>
    private async Task GetAllCatalogue(string url)
    {
        var html = await GetHtmlAsync(url);
        if (string.IsNullOrEmpty(html))
        {
            // Log the address we actually failed on (the old code logged the
            // never-assigned _url field).
            _logger.Error($"任务非正常终止,获取章节列表失败,无法访问{url}");
            return;
        }
        _bookName = await GetBookName(html);
        if (_bookName is null)
        {
            return;
        }
        // Chapters on the current page.
        GetBookCatalogue(html);
        // Follow the pager, guarding against sites where the same page is
        // reachable under two different addresses (would recurse forever).
        var nextpage = await GetNextPage(html);
        if (!string.IsNullOrEmpty(nextpage) && nextpage != url.Replace(_baseAddress, "").Replace("/index.html", "_1/"))
        {
            _logger.Info($"找到下一页,地址{nextpage}");
            await GetAllCatalogue(nextpage);
        }
    }
    #endregion

    #region Page download
    /// <summary>
    /// Downloads a page and decodes it with the configured encoding.
    /// Retries up to 5 times with a 10-second pause on HTTP failures.
    /// </summary>
    /// <param name="url">Target page url; falls back to <see cref="_url"/> when empty.</param>
    /// <returns>The decoded HTML, or an empty string after all retries failed.</returns>
    private async Task<string> GetHtmlAsync(string url)
    {
        const int maxRetries = 5;
        string pageAddress = string.IsNullOrEmpty(url) ? _url : url;
        // NOTE: HttpRequestMessage instances cannot be resent, which is why the
        // shared HttpClient with default headers is used instead.
        for (int attempt = 1; attempt <= maxRetries; attempt++)
        {
            try
            {
                var response = await httpClient.GetAsync(pageAddress);
                response.EnsureSuccessStatusCode();
                // Decode manually to avoid mojibake from non-UTF-8 sites.
                var responseBody = await response.Content.ReadAsByteArrayAsync();
                var responseString = Encoding.GetEncoding(_encoding).GetString(responseBody);
                _logger.Info($"请求{pageAddress},响应状态码: {(int) response.StatusCode}, 请求结果:{response.ReasonPhrase}");
                return responseString;
            }
            catch (HttpRequestException exception)
            {
                // Log pageAddress, not response.RequestMessage — response is
                // null here when GetAsync itself threw.
                _logger.Warn($"请求{pageAddress}失败,{exception.Message},10秒后重试...");
                if (attempt == maxRetries)
                {
                    _logger.Error($"多次请求{pageAddress}失败,已放弃。");
                    break;
                }
                // Task.Delay instead of Thread.Sleep: don't block the pool thread.
                await Task.Delay(TimeSpan.FromSeconds(10));
            }
        }
        return string.Empty;
    }
    #endregion

    #region Title extraction
    /// <summary>
    /// Extracts the book title from the index page HTML via <see cref="_Xpath_Title"/>.
    /// </summary>
    /// <param name="html">HTML of the book's index page.</param>
    /// <returns>The title text, or null when the title node is missing.</returns>
    private Task<string?> GetBookName(string html)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        // Null-conditional: a missing title node means "stop", not a crash.
        var bookName = doc.DocumentNode.SelectSingleNode(_Xpath_Title)?.InnerText;
        _logger.Info($"获取书名: 《{bookName}》");
        return Task.FromResult(bookName);
    }
    #endregion

    #region Chapter list extraction
    /// <summary>
    /// Parses the chapter entries on one list page and appends them to
    /// <see cref="bookCatalogues"/>.
    /// </summary>
    private void GetBookCatalogue(string html)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        HtmlNodeCollection catalogue = doc.DocumentNode.SelectNodes(_Xpath_Content);
        if (catalogue is null)
        {
            // Selector matched nothing on this page; nothing to add.
            return;
        }
        foreach (var data in catalogue)
        {
            // The row may itself be the <a>, or contain one, depending on the
            // configured XPath — descendant-or-self covers both.
            HtmlNode a = data.SelectSingleNode("descendant-or-self::a");
            if (a is null)
            {
                continue;
            }
            string u = a.Attributes["href"].Value;
            bookCatalogues.Add(new Catalogue(data.InnerText, u));
        }
    }

    /// <summary>
    /// Looks for a "下一页" (next page) link among the pager links of a chapter-list page.
    /// </summary>
    /// <returns>The next page's href, or an empty string when there is none.</returns>
    private Task<string?> GetNextPage(string html)
    {
        string? nextpageUrl = String.Empty;
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        HtmlNodeCollection page = doc.DocumentNode.SelectNodes(_Xpath_Nextpage);
        if (page is not null)
        {
            foreach (var data in page)
            {
                // _Xpath_Nextpage usually selects the <a> itself; fall back to
                // searching inside the node otherwise.
                HtmlNode a = data.SelectSingleNode("descendant-or-self::a");
                if (a is null)
                {
                    continue;
                }
                if (data.InnerText.Contains("下一页"))
                {
                    nextpageUrl = a.Attributes["href"].Value;
                }
            }
        }
        return Task.FromResult(nextpageUrl);
    }
    #endregion

    #region Chapter content extraction
    /// <summary>
    /// Downloads one chapter, following its internal "下一页" pagination, and
    /// returns the accumulated InnerHtml (the &lt;br&gt; markers are cleaned later).
    /// </summary>
    /// <param name="ChapterUrl">Address of the chapter's first page.</param>
    /// <returns>The chapter's raw HTML text (possibly empty on failure).</returns>
    private async Task<string> GetChapterContents(string ChapterUrl)
    {
        var chapterContents = new StringBuilder();
        bool morePages = true;
        while (morePages)
        {
            var chapterHtml = await GetHtmlAsync(ChapterUrl);
            if (string.IsNullOrEmpty(chapterHtml))
            {
                // Unreachable page: return what we have instead of looping forever.
                break;
            }
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(chapterHtml);
            HtmlNodeCollection contentsList = doc.DocumentNode.SelectNodes(_Xpath_ChapterContent);
            if (contentsList is not null)
            {
                foreach (var content in contentsList)
                {
                    // InnerHtml (not InnerText) so <br> survives for the
                    // paragraph handling in SpyderBook.
                    chapterContents.Append(content.InnerHtml);
                }
            }
            // Follow "下一页" within the chapter; guard against a missing pager node.
            var nextpage = doc.DocumentNode.SelectSingleNode(_Xpath_ChapterContent_Nextpage);
            if (nextpage is not null && nextpage.InnerText.Trim().Equals("下一页"))
            {
                ChapterUrl = nextpage.Attributes["href"].Value;
            }
            else
            {
                morePages = false;
            }
        }
        return chapterContents.ToString();
    }
    #endregion

    /// <summary>
    /// Writes the finished text to <paramref name="filepath"/>, creating the
    /// directory if needed and overwriting any existing file.
    /// </summary>
    /// <param name="log">Text to write.</param>
    /// <param name="filepath">Full path of the output file.</param>
    private static void Write2Txt(string log, string filepath)
    {
        try
        {
            string? folder = Path.GetDirectoryName(filepath);
            if (!string.IsNullOrEmpty(folder))
            {
                // CreateDirectory is a no-op when the directory already exists.
                Directory.CreateDirectory(folder);
            }
            // WriteAllText overwrites, replacing the old delete/create/append dance.
            File.WriteAllText(filepath, log + "\r\n", Encoding.Default);
        }
        catch (Exception exception)
        {
            _logger.Error(exception.Message);
        }
    }
}
调用方式
SpyderEngine spyderEngine = new SpyderEngine("http://m.xxxxxxxxx.net",
"GBK", null, null, null);
await spyderEngine.SpyderBook("http://xxxxxxxxxxx/xxxx/xx/1220_1/");
具体网站地址就不透露啦,好不容易找到个能爬最新江山云罗的网站:keai 其实C#语法非常先进,就用我文中的代码为例,既要实现任务并发,又要实现顺序锁,不管是Python还是JAVA都非常啰嗦,但C#就一句话搞定
Parallel.ForEach(bookCatalogues, new ParallelOptions() {MaxDegreeOfParallelism = _Max_Concurrency},
(_catalogue, loopState) =>
{
要并发的内容
});
单纯个人开发,其实C#的效率还是非常高的:lol 都7.0了,日子过得真快 收藏学习,这个语言的教程好像不多哟 .net发展很快啊 膜拜大佬 大佬牛逼,膜拜 真不错。太厉害了 学习大佬{:1_893:} 学习学习