// [C#] Source listing copied from a web page (original header text: "view as plain text / copy code").
using System;
using System.Collections.Generic;
using System.Net.Http;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using HtmlAgilityPack;
using NLog;
namespace ConsoleBookSpyder.Sites
{
/// <summary>
/// 提供小说爬取和处理功能的静态类
/// </summary>
public static class shubaoju
{
#region Constants
// Root URL of the target site. Used as the HttpClient base address, as the base
// for resolving "next page" links, and (via its host name) as the output folder name.
private const string BaseUrl = "https://i.shubaoju.cc";
// Name of the text encoding used to decode downloaded pages.
// Note: because this constant is called Encoding, code inside this class has to
// spell the framework type out as System.Text.Encoding.
private const string Encoding = "GBK";
// XPath of the node that holds the book title.
private const string XpathBookName = "//*[@id=\"_52mb_h1\"]";
// XPath of the chapter entries on a chapter-list page.
private const string XpathChapterList = "//ul[@class='chapter']/li";
// XPath of the node(s) that hold the chapter text.
private const string XpathContents = "//*[@id=\"nr1\"]";
// Maximum number of attempts made when fetching a page fails.
private const int MaxRetries = 5;
// Maximum number of chapters whose content is fetched concurrently.
private static readonly int MaxConcurrency = 40;
#endregion
#region Fields
private static readonly Logger Logger = LogManager.GetCurrentClassLogger();
private static readonly HttpClient HttpClient = CreateHttpClient();
// Progress counters (completed / total chapters) displayed in the console title.
private static int _completedTasks;
private static int _totalTasks;
#endregion
#region Public Methods
/// <summary>
/// Runs the whole download pipeline for one novel: fetches the index page, reads the
/// book title and chapter list, downloads every chapter, cleans the text and writes it
/// to a text file.
/// </summary>
/// <param name="targetUrl">URL of the novel's index (chapter list) page.</param>
/// <returns>
/// A task that completes when the pipeline has finished. Failures are logged and
/// never rethrown to the caller.
/// </returns>
public static async Task ExecuteAsync(string targetUrl)
{
    try
    {
        // A blank URL would make HttpClient fall back to its BaseAddress and
        // silently scrape the site's home page instead of a book.
        if (string.IsNullOrWhiteSpace(targetUrl))
        {
            Logger.Error("目标URL为空,无法执行爬取");
            return;
        }

        var html = await GetHtmlAsync(targetUrl);

        // GetHtmlAsync reports a failed download as an empty string; stop here
        // rather than producing an empty file named after the fallback title.
        if (string.IsNullOrEmpty(html))
        {
            Logger.Error($"无法获取目标页面内容,已终止: {targetUrl}");
            return;
        }

        var bookName = GetBookName(html);
        var chapters = await GetBookChaptersAsync(html);

        if (chapters.Count == 0)
        {
            Logger.Warn($"未获取到任何章节,不生成文件: {targetUrl}");
            return;
        }

        _totalTasks = chapters.Count; // total number of chapters to download
        _completedTasks = 0;          // reset the progress counter
        Console.Title = $"任务进度:{_completedTasks}/{_totalTasks}";

        var novelContent = await ProcessChaptersAsync(chapters);
        var amendedContent = AmendContent(novelContent);
        await WriteToFileAsync(amendedContent, GetFilePath(bookName));
    }
    catch (Exception ex)
    {
        Logger.Error(ex, "执行过程中发生错误");
    }
}
#endregion
#region Private Methods
/// <summary>
/// Reads the book title out of the index page HTML.
/// </summary>
/// <param name="html">HTML of the novel's index page.</param>
/// <returns>
/// The trimmed, HTML-decoded text of the title node, or the placeholder
/// "未知书名" when the title node is not present.
/// </returns>
private static string GetBookName(string html)
{
    var document = new HtmlDocument();
    document.LoadHtml(html);

    var titleNode = document.DocumentNode.SelectSingleNode(XpathBookName);

    string rawTitle;
    if (titleNode == null)
    {
        // No title node on the page: fall back to the placeholder name.
        rawTitle = "未知书名";
    }
    else
    {
        rawTitle = titleNode.InnerText.Trim();
    }

    var decodedTitle = System.Web.HttpUtility.HtmlDecode(rawTitle);
    Logger.Info($"获取书名: 《{decodedTitle}》");
    return decodedTitle;
}
/// <summary>
/// Collects the chapter list of a novel, starting from the already downloaded index
/// page and following "下一页" (next page) links until none is left.
/// </summary>
/// <param name="initialHtml">HTML of the first chapter-list page.</param>
/// <returns>
/// The chapters in the order they were found. The list is empty when no chapter
/// entry was found on any page.
/// </returns>
private static async Task<List<Chapter>> GetBookChaptersAsync(string initialHtml)
{
    var chapters = new List<Chapter>();
    var siteRoot = new Uri(BaseUrl);
    var currentPageUrl = BaseUrl;
    var html = initialHtml;

    // URLs of the list pages already requested. Without this, a "next page" link
    // that points back to an earlier page would keep the loop running forever.
    // The URL of the first page is not known here, so only follow-up pages are tracked.
    var requestedPages = new HashSet<string>(StringComparer.Ordinal);

    while (true)
    {
        var doc = new HtmlDocument();
        doc.LoadHtml(html);

        var chapterNodes = doc.DocumentNode.SelectNodes(XpathChapterList);
        if (chapterNodes == null)
        {
            Logger.Warn($"在页面 {currentPageUrl} 未找到章节列表");
        }
        else
        {
            foreach (var node in chapterNodes)
            {
                var aNode = node.SelectSingleNode(".//a");
                if (aNode == null)
                {
                    continue;
                }

                var href = aNode.GetAttributeValue("href", "");
                if (!string.IsNullOrEmpty(href))
                {
                    chapters.Add(new Chapter(aNode.InnerText.Trim(), href));
                }
            }
        }

        // Look for the "next page" link of the chapter list.
        var nextPageNode = doc.DocumentNode.SelectSingleNode("//a[contains(text(), '下一页')]");
        if (nextPageNode == null)
        {
            Logger.Info("未找到'下一页'链接,章节列表获取完成");
            break;
        }

        var nextPageUrl = nextPageNode.GetAttributeValue("href", "");
        if (string.IsNullOrEmpty(nextPageUrl))
        {
            Logger.Warn("找到'下一页'链接,但URL为空");
            break;
        }

        // Relative links are resolved against the site root (BaseUrl).
        var resolvedUrl = new Uri(siteRoot, nextPageUrl).AbsoluteUri;
        if (!requestedPages.Add(resolvedUrl))
        {
            Logger.Warn($"检测到重复的章节列表页面 {resolvedUrl},停止翻页");
            break;
        }

        currentPageUrl = resolvedUrl;
        Logger.Info($"正在获取下一页章节列表: {currentPageUrl}");
        html = await GetHtmlAsync(currentPageUrl);
    }

    if (chapters.Count == 0)
    {
        Logger.Warn("未找到任何章节");
    }
    else
    {
        Logger.Info($"总共获取到 {chapters.Count} 个章节");
    }

    return chapters;
}
/// <summary>
/// Downloads the content of every chapter, running at most
/// <c>MaxConcurrency</c> downloads at the same time, and joins the result into one text.
/// </summary>
/// <param name="chapters">Chapters to fill in; each chapter's Content is set in place.</param>
/// <returns>
/// All chapters in list order, each rendered as title, blank line, content, with a
/// blank line between chapters. A chapter whose download failed keeps its empty content.
/// </returns>
private static async Task<string> ProcessChaptersAsync(List<Chapter> chapters)
{
    // Disposed once every download task has finished (Task.WhenAll below).
    using var semaphore = new SemaphoreSlim(MaxConcurrency);

    var tasks = chapters.Select(async chapter =>
    {
        // Acquire the slot BEFORE entering try/finally: Release() must only run
        // for a slot that was actually acquired, otherwise the semaphore count
        // would grow past MaxConcurrency (or Release would throw).
        await semaphore.WaitAsync();
        try
        {
            chapter.Content = await GetChapterContentsAsync(chapter.Url);
            UpdateConsoleTitle(); // one more chapter done
        }
        catch (Exception ex)
        {
            // Best effort: a failed chapter is logged and left empty.
            Logger.Error(ex, $"获取章节内容失败: {chapter.Url}");
        }
        finally
        {
            semaphore.Release();
        }
    });

    await Task.WhenAll(tasks);

    // Show the finished state even if some chapters failed.
    Console.Title = $"任务进度:{_totalTasks}/{_totalTasks}";

    return string.Join(Environment.NewLine + Environment.NewLine,
        chapters.Select(chapter => $"{chapter.Title}{Environment.NewLine}{Environment.NewLine}{chapter.Content}"));
}
/// <summary>
/// Counts one more finished chapter and shows the new progress in the console title.
/// </summary>
private static void UpdateConsoleTitle()
{
    // Use the value returned by Interlocked.Increment: re-reading the shared field
    // afterwards could pick up increments made by other tasks in the meantime.
    var completed = Interlocked.Increment(ref _completedTasks);
    Console.Title = $"任务进度:{completed}/{_totalTasks}";
}
/// <summary>
/// Downloads one chapter. A chapter may be split over several pages; the pages are
/// followed through their "下一页" (next page) links and their content is concatenated.
/// </summary>
/// <param name="initialChapterUrl">
/// URL of the chapter's first page, absolute or relative to <c>BaseUrl</c>.
/// </param>
/// <returns>
/// The trimmed inner HTML of the content nodes of all pages, one line per node.
/// Empty when no content node was found.
/// </returns>
private static async Task<string> GetChapterContentsAsync(string initialChapterUrl)
{
    var contentBuilder = new StringBuilder();

    // Work with absolute URIs so that relative "next page" links can be resolved
    // against the page they appear on instead of against the site root.
    var currentUri = new Uri(new Uri(BaseUrl), initialChapterUrl);

    // Pages already requested; stops the loop if a link leads back to one of them.
    var requestedPages = new HashSet<string>(StringComparer.Ordinal) { currentUri.AbsoluteUri };

    while (true)
    {
        var currentUrl = currentUri.AbsoluteUri;
        var chapterHtml = await GetHtmlAsync(currentUrl);

        var doc = new HtmlDocument();
        doc.LoadHtml(chapterHtml);

        var contentNodes = doc.DocumentNode.SelectNodes(XpathContents);
        if (contentNodes == null)
        {
            Logger.Warn($"在页面 {currentUrl} 未找到内容");
        }
        else
        {
            foreach (var node in contentNodes)
            {
                // Keep the markup; the line-break tags are converted later on.
                contentBuilder.AppendLine(node.InnerHtml.Trim());
            }
        }

        // Look for the "next page" link of this chapter.
        var nextPageNode = doc.DocumentNode.SelectSingleNode("//a[contains(text(), '下一页')]");
        if (nextPageNode == null)
        {
            Logger.Info("未找到'下一页'链接,章节内容获取完成");
            break;
        }

        var nextPageUrl = nextPageNode.GetAttributeValue("href", "");
        if (string.IsNullOrEmpty(nextPageUrl))
        {
            Logger.Info("找到'下一页'链接,但URL为空,章节内容获取完成");
            break;
        }

        // Links such as "javascript:..." cannot be downloaded; treating them as
        // "no next page" keeps the content collected so far instead of failing.
        if (!Uri.TryCreate(currentUri, nextPageUrl, out var nextUri)
            || (nextUri.Scheme != Uri.UriSchemeHttp && nextUri.Scheme != Uri.UriSchemeHttps))
        {
            Logger.Warn($"'下一页'链接不是有效的网页地址: {nextPageUrl},章节内容获取完成");
            break;
        }

        // Drop any "#fragment" so that a link like "#" is recognised as the same page.
        nextUri = new Uri(nextUri.GetLeftPart(UriPartial.Query));
        if (!requestedPages.Add(nextUri.AbsoluteUri))
        {
            Logger.Warn($"检测到重复的章节页面 {nextUri.AbsoluteUri},停止翻页");
            break;
        }

        currentUri = nextUri;
        Logger.Info($"正在获取下一页内容: {currentUri.AbsoluteUri}");
    }

    var content = contentBuilder.ToString().Trim();
    Logger.Info($"获取到的章节内容长度: {content.Length} 字符");
    return content;
}
// Matches <br>, <br/>, <br /> and their upper-case / extra-whitespace variants.
private static readonly Regex LineBreakTagRegex =
    new Regex(@"<br\s*/?>", RegexOptions.Compiled | RegexOptions.IgnoreCase);

// Matches any remaining tag that starts and ends on the same line.
private static readonly Regex AnyTagRegex = new Regex("<.*?>", RegexOptions.Compiled);

// Site boilerplate removed from the text (each alternative stays within one line):
//   字数: ... 新群*      - a "word count" line ending in "新群*", with or without
//                          ASCII or full-width parentheses around 新群
//   -->> ... 继续阅读)    - the "continue reading" hint, closed by ")" or ")"
//   【 ... 】             - anything between full-width square brackets
// Parentheses are written as character classes so they are matched literally;
// a bare ")" in the pattern is rejected by the regex parser.
private static readonly Regex SiteNoiseRegex =
    new Regex(@"字数[::].*[((]?新群[))]?\*|-->>.*继续阅读[))]|【.*】", RegexOptions.Compiled);

/// <summary>
/// Turns the collected chapter HTML into plain text: converts paragraph and line
/// break tags to newlines, removes the remaining tags, decodes HTML entities and
/// strips site boilerplate.
/// </summary>
/// <param name="content">Raw novel text with embedded HTML; null is treated as empty.</param>
/// <returns>The cleaned plain text.</returns>
private static string AmendContent(string content)
{
    if (string.IsNullOrEmpty(content))
    {
        return string.Empty;
    }

    content = content.Replace("</p><p>", Environment.NewLine + Environment.NewLine);
    content = LineBreakTagRegex.Replace(content, Environment.NewLine);

    // Remove tags BEFORE decoding entities: otherwise text such as "1 &lt; 2 &gt; 0"
    // decodes to "1 < 2 > 0" and the part between "<" and ">" is deleted as a tag.
    content = AnyTagRegex.Replace(content, string.Empty);
    content = System.Web.HttpUtility.HtmlDecode(content);

    return SiteNoiseRegex.Replace(content, "\r\n\r\n");
}
/// <summary>
/// Saves the finished text to disk, creating the target folder first when needed.
/// Errors are logged and not propagated.
/// </summary>
/// <param name="content">Text to write.</param>
/// <param name="filePath">Full path of the file to create or overwrite.</param>
private static async Task WriteToFileAsync(string content, string filePath)
{
    try
    {
        // Only create a folder when the path actually contains one.
        if (Path.GetDirectoryName(filePath) is { Length: > 0 } targetFolder)
        {
            Directory.CreateDirectory(targetFolder);
        }

        var fileEncoding = System.Text.Encoding.Default;
        await File.WriteAllTextAsync(filePath, content, fileEncoding);

        Logger.Info($"文件已成功写入: {filePath}");
    }
    catch (Exception writeError)
    {
        Logger.Error(writeError, $"写入文件时发生错误: {filePath}");
    }
}
/// <summary>
/// Builds the path of the output file:
/// &lt;application directory&gt;/下载/&lt;site host name&gt;/&lt;book name&gt;.txt.
/// </summary>
/// <param name="bookName">
/// Book title as scraped from the page. Characters that are not allowed in file
/// names are replaced by '_'; a null or blank title falls back to "未知书名".
/// </param>
/// <returns>The full path of the text file for this book.</returns>
private static string GetFilePath(string bookName)
{
    // The title comes from a web page and may contain '/', ':', '?' and the like,
    // which would either be rejected by the file system or change the folder.
    var safeName = string.Join("_", (bookName ?? string.Empty).Split(Path.GetInvalidFileNameChars())).Trim();
    if (safeName.Length == 0)
    {
        safeName = "未知书名";
    }

    var domain = new Uri(BaseUrl).Host;
    return Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "下载", domain, $"{safeName}.txt");
}
// Decoder for downloaded pages, created on first use. Code-page encodings such as
// GBK are not available on .NET Core / .NET 5+ until the code pages provider has
// been registered; registering it more than once is harmless.
private static readonly Lazy<System.Text.Encoding> PageEncoding =
    new Lazy<System.Text.Encoding>(() =>
    {
        System.Text.Encoding.RegisterProvider(System.Text.CodePagesEncodingProvider.Instance);
        return System.Text.Encoding.GetEncoding(Encoding);
    });

/// <summary>
/// Downloads a page and decodes it with the configured page encoding, retrying up to
/// <c>MaxRetries</c> times with a 10 second pause after a failed request or a timeout.
/// </summary>
/// <param name="url">Absolute URL, or a URL relative to the HttpClient base address.</param>
/// <returns>The decoded page text, or an empty string when every attempt failed.</returns>
private static async Task<string> GetHtmlAsync(string url)
{
    for (var attempt = 1; attempt <= MaxRetries; attempt++)
    {
        try
        {
            // Dispose the response so its connection goes back to the pool promptly.
            using var response = await HttpClient.GetAsync(url);
            response.EnsureSuccessStatusCode();

            var responseBody = await response.Content.ReadAsByteArrayAsync();
            var responseString = PageEncoding.Value.GetString(responseBody);

            Logger.Info($"请求 {url} 成功,响应状态码: {(int)response.StatusCode}");
            return responseString;
        }
        // HttpClient reports a request timeout as TaskCanceledException, which is
        // just as transient as a failed request and therefore retried as well.
        catch (Exception ex) when (ex is HttpRequestException || ex is TaskCanceledException)
        {
            if (attempt == MaxRetries)
            {
                // Last attempt: no point in waiting before giving up.
                Logger.Warn($"请求 {url} 失败,{ex.Message}");
                break;
            }

            Logger.Warn($"请求 {url} 失败,{ex.Message},10秒后重试...");
            await Task.Delay(TimeSpan.FromSeconds(10));
        }
    }

    Logger.Error($"多次请求 {url} 失败,已放弃。");
    return string.Empty;
}
/// <summary>
/// Builds the shared HttpClient: base address set to <c>BaseUrl</c> and a fixed set
/// of browser-like default request headers.
/// </summary>
/// <returns>The configured client.</returns>
private static HttpClient CreateHttpClient()
{
    // Default headers, added in this order.
    // NOTE(review): the Cookie value is a hard-coded browser session; it is optional
    // and should be adjusted depending on whether the target site requires a login.
    var defaultHeaders = new (string Name, string Value)[]
    {
        ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"),
        ("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6"),
        ("Cache-Control", "max-age=0"),
        ("Connection", "keep-alive"),
        ("Cookie", "cJumpPV10861=1; autojumpPV10861=6; __51vcke__K0XyxJ3OaBItNhg9=3b3030a8-3fd7-5de0-8777-5218fa3ae91f; __51vuft__K0XyxJ3OaBItNhg9=1719545541437; __51uvsct__K0XyxJ3OaBItNhg9=7; PHPSESSID=363b7e9b0cf6d31cb701f1b747f8bc9c; autojumpStats10861=1; autojumpNum10861=1; cJumpPV10861=1; autojumpPV10861=3; __vtins__K0XyxJ3OaBItNhg9=%7B%22sid%22%3A%20%228336a7ae-3043-5bf6-a7c4-736a2d7c0f9c%22%2C%20%22vd%22%3A%208%2C%20%22stt%22%3A%20334475%2C%20%22dr%22%3A%203517%2C%20%22expires%22%3A%201719579012508%2C%20%22ct%22%3A%201719577212508%7D"),
        ("Upgrade-Insecure-Requests", "1"),
        ("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Mobile Safari/537.36 Edg/119.0.0.0"),
    };

    var client = new HttpClient();
    client.BaseAddress = new Uri(BaseUrl);

    var requestHeaders = client.DefaultRequestHeaders;
    requestHeaders.Clear();
    foreach (var (headerName, headerValue) in defaultHeaders)
    {
        requestHeaders.Add(headerName, headerValue);
    }

    return client;
}
#endregion
#region Inner Classes
/// <summary>
/// One chapter of a novel: its title, the URL it is downloaded from and, once
/// fetched, its content.
/// </summary>
private class Chapter
{
    /// <summary>
    /// Creates a chapter whose content is still empty.
    /// </summary>
    /// <param name="title">Chapter title.</param>
    /// <param name="url">URL of the chapter's first page.</param>
    public Chapter(string title, string url)
    {
        (Title, Url) = (title, url);
    }

    /// <summary>Chapter title.</summary>
    public string Title { get; }

    /// <summary>URL of the chapter's first page.</summary>
    public string Url { get; }

    /// <summary>Chapter text; an empty string until it has been assigned.</summary>
    public string Content { get; set; } = string.Empty;
}
#endregion
}
}