using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using HtmlAgilityPack;
using NLog;
namespace
ConsoleBookSpyder.Sites
{
/// <summary>
/// Static class that crawls a novel from the shubaoju site and post-processes its text.
/// </summary>
public
static
class
shubaoju
{
#region Constants
/// <summary>Root address of the target site; also used as the HTTP client's base address.</summary>
private const string BaseUrl = "https://i.shubaoju.cc";

/// <summary>Name of the character encoding used to decode response bodies.</summary>
private const string Encoding = "GBK";

/// <summary>XPath of the node that holds the book title.</summary>
private const string XpathBookName = @"//*[@id=""_52mb_h1""]";

/// <summary>XPath of the list items that make up the chapter list.</summary>
private const string XpathChapterList = "//ul[@class='chapter']/li";

/// <summary>XPath of the node(s) that hold the chapter text.</summary>
private const string XpathContents = @"//*[@id=""nr1""]";

/// <summary>Maximum number of attempts for a single HTTP request.</summary>
private const int MaxRetries = 5;

/// <summary>Maximum number of chapters that are downloaded at the same time.</summary>
private static readonly int MaxConcurrency = 40;
#endregion
#region Fields
/// <summary>Logger shared by every method of this class.</summary>
private static readonly Logger Logger = LogManager.GetCurrentClassLogger();

/// <summary>Single HTTP client instance reused for every request.</summary>
private static readonly HttpClient HttpClient = CreateHttpClient();

/// <summary>Number of chapters downloaded so far; incremented atomically by the workers.</summary>
private static int _completedTasks;

/// <summary>Total number of chapters scheduled for the current run.</summary>
private static int _totalTasks;
#endregion
#region Public Methods
/// <summary>
/// Runs the complete download pipeline for one novel: fetches the index page, reads the
/// book name and chapter list, downloads every chapter, cleans the text and writes it to disk.
/// </summary>
/// <param name="targetUrl">URL of the novel's chapter index page.</param>
/// <returns>
/// A task that completes when the run has finished. Failures are logged, not thrown.
/// </returns>
public static async Task ExecuteAsync(string targetUrl)
{
    try
    {
        // A blank URL would make the HTTP client fall back to its base address and
        // silently crawl the wrong page, so reject it up front.
        if (string.IsNullOrWhiteSpace(targetUrl))
        {
            Logger.Error("目标URL为空,无法执行爬取");
            return;
        }

        var html = await GetHtmlAsync(targetUrl);

        // GetHtmlAsync returns an empty string once every retry has failed.
        if (string.IsNullOrEmpty(html))
        {
            Logger.Error($"无法获取目录页内容: {targetUrl}");
            return;
        }

        var bookName = GetBookName(html);
        var chapters = await GetBookChaptersAsync(html);

        // Nothing to download: do not leave an empty file behind.
        if (chapters.Count == 0)
        {
            Logger.Warn("章节列表为空,已跳过文件写入");
            return;
        }

        _totalTasks = chapters.Count;
        _completedTasks = 0;
        Console.Title = $"任务进度:{_completedTasks}/{_totalTasks}";

        var novelContent = await ProcessChaptersAsync(chapters);
        var amendedContent = AmendContent(novelContent);
        await WriteToFileAsync(amendedContent, GetFilePath(bookName));
    }
    catch (Exception ex)
    {
        Logger.Error(ex, "执行过程中发生错误");
    }
}
#endregion
#region Private Methods
/// <summary>
/// Reads the book title from the index page markup.
/// </summary>
/// <param name="html">Markup of the index page.</param>
/// <returns>
/// The trimmed, HTML-decoded title, or the placeholder "未知书名" when the title node is missing.
/// </returns>
private static string GetBookName(string html)
{
    var document = new HtmlDocument();
    document.LoadHtml(html);

    var titleNode = document.DocumentNode.SelectSingleNode(XpathBookName);

    string rawTitle;
    if (titleNode == null)
    {
        rawTitle = "未知书名";
    }
    else
    {
        rawTitle = titleNode.InnerText.Trim();
    }

    var decodedTitle = System.Web.HttpUtility.HtmlDecode(rawTitle);
    Logger.Info($"获取书名: 《{decodedTitle}》");
    return decodedTitle;
}
/// <summary>
/// Collects every chapter (title and link) of the novel, following the "下一页" pagination
/// links of the chapter index until none is left.
/// </summary>
/// <param name="initialHtml">Markup of the first index page, already downloaded by the caller.</param>
/// <returns>The chapters in the order they appear on the index pages; empty when none is found.</returns>
/// <remarks>
/// The URL of the first index page is not passed in, so links found on it are resolved against
/// the site root. Links on later pages are resolved against the page they were found on.
/// Pagination stops when a link is malformed, is not HTTP(S), or points to a page that was
/// already visited, so a self-referencing "next page" link cannot cause an endless loop.
/// </remarks>
private static async Task<List<Chapter>> GetBookChaptersAsync(string initialHtml)
{
    var chapters = new List<Chapter>();
    var currentPageUrl = BaseUrl;
    var html = initialHtml;

    // Index pages fetched by this method (fragment stripped), used to detect pagination cycles.
    var visitedPages = new HashSet<string>(StringComparer.Ordinal);

    while (true)
    {
        var doc = new HtmlDocument();
        doc.LoadHtml(html);

        var chapterNodes = doc.DocumentNode.SelectNodes(XpathChapterList);
        if (chapterNodes != null)
        {
            foreach (var node in chapterNodes)
            {
                var aNode = node.SelectSingleNode(".//a");
                if (aNode == null)
                {
                    continue;
                }

                var title = aNode.InnerText.Trim();
                var href = aNode.GetAttributeValue("href", "");
                if (!string.IsNullOrEmpty(href))
                {
                    chapters.Add(new Chapter(title, href));
                }
            }
        }
        else
        {
            Logger.Warn($"在页面 {currentPageUrl} 未找到章节列表");
        }

        var nextPageNode = doc.DocumentNode.SelectSingleNode("//a[contains(text(), '下一页')]");
        if (nextPageNode == null)
        {
            Logger.Info("未找到'下一页'链接,章节列表获取完成");
            break;
        }

        var nextPageHref = nextPageNode.GetAttributeValue("href", "");
        if (string.IsNullOrEmpty(nextPageHref))
        {
            Logger.Warn("找到'下一页'链接,但URL为空");
            break;
        }

        // TryCreate instead of the Uri constructor: a malformed href must end pagination,
        // not abort the whole run with a UriFormatException.
        Uri nextPageUri;
        var isFetchable = Uri.TryCreate(new Uri(currentPageUrl), nextPageHref, out nextPageUri)
            && (nextPageUri.Scheme == Uri.UriSchemeHttp || nextPageUri.Scheme == Uri.UriSchemeHttps);
        if (!isFetchable)
        {
            Logger.Warn($"'下一页'链接无效,停止翻页: {nextPageHref}");
            break;
        }

        // Drop the fragment so that "#"-style links map onto the page they belong to.
        var nextPageUrl = nextPageUri.GetLeftPart(UriPartial.Query);
        if (!visitedPages.Add(nextPageUrl))
        {
            Logger.Warn($"检测到重复的章节列表页面,停止翻页: {nextPageUrl}");
            break;
        }

        currentPageUrl = nextPageUrl;
        Logger.Info($"正在获取下一页章节列表: {currentPageUrl}");
        html = await GetHtmlAsync(currentPageUrl);
    }

    if (chapters.Count == 0)
    {
        Logger.Warn("未找到任何章节");
    }
    else
    {
        Logger.Info($"总共获取到 {chapters.Count} 个章节");
    }

    return chapters;
}
/// <summary>
/// Downloads the text of every chapter, running at most <c>MaxConcurrency</c> downloads at a
/// time, and joins the results into one string.
/// </summary>
/// <param name="chapters">Chapters to download; each item's <c>Content</c> is filled in place.</param>
/// <returns>
/// All chapters in list order, each as title, blank line, content, separated by a blank line.
/// A chapter whose download failed keeps its empty content; the failure is logged.
/// </returns>
private static async Task<string> ProcessChaptersAsync(List<Chapter> chapters)
{
    using (var semaphore = new SemaphoreSlim(MaxConcurrency))
    {
        var tasks = chapters.Select(async chapter =>
        {
            // Acquire before entering the try block, so the finally clause can only
            // release a slot that was really taken.
            await semaphore.WaitAsync();
            try
            {
                chapter.Content = await GetChapterContentsAsync(chapter.Url);
                UpdateConsoleTitle();
            }
            catch (Exception ex)
            {
                Logger.Error(ex, $"获取章节内容失败: {chapter.Url}");
            }
            finally
            {
                semaphore.Release();
            }
        }).ToList();

        // Every worker has finished before the semaphore is disposed.
        await Task.WhenAll(tasks);
    }

    Console.Title = $"任务进度:{_totalTasks}/{_totalTasks}";

    return string.Join(
        Environment.NewLine + Environment.NewLine,
        chapters.Select(chapter => $"{chapter.Title}{Environment.NewLine}{Environment.NewLine}{chapter.Content}"));
}
/// <summary>
/// Counts one more finished chapter and shows the new progress in the console title.
/// </summary>
private static void UpdateConsoleTitle()
{
    // Use the value returned by the atomic increment. Re-reading the field could pick up
    // increments made by other workers in the meantime.
    var completed = Interlocked.Increment(ref _completedTasks);
    Console.Title = $"任务进度:{completed}/{_totalTasks}";
}
/// <summary>
/// Downloads the full text of one chapter, following its "下一页" links across pages.
/// </summary>
/// <param name="initialChapterUrl">
/// Link of the chapter's first page; may be absolute or relative to the site root.
/// </param>
/// <returns>
/// The concatenated inner HTML of the content nodes of every page, trimmed. Empty when the
/// link cannot be fetched or no content is found.
/// </returns>
/// <remarks>
/// Next-page links are resolved against the page they were found on. Paging stops when a link
/// is malformed, is not HTTP(S), or leads back to a page already read for this chapter.
/// </remarks>
private static async Task<string> GetChapterContentsAsync(string initialChapterUrl)
{
    var contentBuilder = new StringBuilder();

    // Resolve the chapter link against the site root, as the HTTP client's base address
    // would, so that later relative links have an absolute page to be resolved against.
    Uri currentUri;
    var isFetchable = Uri.TryCreate(new Uri(BaseUrl), initialChapterUrl, out currentUri)
        && (currentUri.Scheme == Uri.UriSchemeHttp || currentUri.Scheme == Uri.UriSchemeHttps);
    if (!isFetchable)
    {
        Logger.Warn($"章节链接无效,已跳过: {initialChapterUrl}");
        return string.Empty;
    }

    // Pages already read for this chapter (fragment stripped), used to detect paging cycles.
    var currentUrl = currentUri.GetLeftPart(UriPartial.Query);
    var visitedPages = new HashSet<string>(StringComparer.Ordinal) { currentUrl };

    while (true)
    {
        var chapterHtml = await GetHtmlAsync(currentUrl);
        var doc = new HtmlDocument();
        doc.LoadHtml(chapterHtml);

        var contentNodes = doc.DocumentNode.SelectNodes(XpathContents);
        if (contentNodes != null)
        {
            foreach (var node in contentNodes)
            {
                contentBuilder.AppendLine(node.InnerHtml.Trim());
            }
        }
        else
        {
            Logger.Warn($"在页面 {currentUrl} 未找到内容");
        }

        var nextPageNode = doc.DocumentNode.SelectSingleNode("//a[contains(text(), '下一页')]");
        if (nextPageNode == null)
        {
            Logger.Info("未找到'下一页'链接,章节内容获取完成");
            break;
        }

        var nextPageHref = nextPageNode.GetAttributeValue("href", "");
        if (string.IsNullOrEmpty(nextPageHref))
        {
            Logger.Info("找到'下一页'链接,但URL为空,章节内容获取完成");
            break;
        }

        // A malformed or non-HTTP link (e.g. "javascript:") ends the chapter instead of throwing.
        Uri nextUri;
        var nextIsFetchable = Uri.TryCreate(currentUri, nextPageHref, out nextUri)
            && (nextUri.Scheme == Uri.UriSchemeHttp || nextUri.Scheme == Uri.UriSchemeHttps);
        if (!nextIsFetchable)
        {
            Logger.Warn($"'下一页'链接无效,章节内容获取完成: {nextPageHref}");
            break;
        }

        var nextUrl = nextUri.GetLeftPart(UriPartial.Query);
        if (!visitedPages.Add(nextUrl))
        {
            Logger.Warn($"检测到重复的内容页面,章节内容获取完成: {nextUrl}");
            break;
        }

        currentUri = nextUri;
        currentUrl = nextUrl;
        Logger.Info($"正在获取下一页内容: {currentUrl}");
    }

    var content = contentBuilder.ToString().Trim();
    Logger.Info($"获取到的章节内容长度: {content.Length} 字符");
    return content;
}
/// <summary>
/// Pattern for boilerplate lines injected by the site: word-count banners, the
/// "-->> ... 继续阅读)" paging hint, and anything between 【 and 】.
/// </summary>
/// <remarks>
/// The closing parenthesis after 继续阅读 is escaped: left bare it is an unbalanced group
/// terminator, which makes the .NET regex parser reject the whole pattern.
/// </remarks>
private static readonly Regex JunkTextRegex =
    new Regex(@"字数:.*(新群)\*|-->>.*继续阅读\)|【.*】", RegexOptions.Compiled);

/// <summary>Pattern for any remaining HTML tag (shortest match between angle brackets).</summary>
private static readonly Regex HtmlTagRegex = new Regex("<.*?>", RegexOptions.Compiled);

/// <summary>
/// Converts the raw chapter HTML into plain text.
/// </summary>
/// <param name="content">Raw content as collected from the chapter pages.</param>
/// <returns>
/// The text with paragraph and line breaks turned into newlines, HTML entities decoded, site
/// boilerplate replaced by a blank line and leftover tags removed. Empty for null or empty input.
/// </returns>
private static string AmendContent(string content)
{
    if (string.IsNullOrEmpty(content))
    {
        return string.Empty;
    }

    // Turn the structural markup into newlines before the remaining tags are stripped.
    content = content
        .Replace("</p><p>", Environment.NewLine + Environment.NewLine)
        .Replace("<br>", Environment.NewLine)
        .Replace("<br/>", Environment.NewLine)
        .Replace("<br />", Environment.NewLine);

    // Decode first: the boilerplate pattern contains ">" which is entity-encoded in the markup.
    content = System.Web.HttpUtility.HtmlDecode(content);
    content = JunkTextRegex.Replace(content, "\r\n\r\n");

    return HtmlTagRegex.Replace(content, string.Empty);
}
/// <summary>
/// Saves the text to the given file, creating the containing folder first when the path has one.
/// </summary>
/// <param name="content">Text to write.</param>
/// <param name="filePath">Destination file path.</param>
/// <returns>A task that completes when the write has finished or its failure has been logged.</returns>
private static async Task WriteToFileAsync(string content, string filePath)
{
    try
    {
        var parentFolder = Path.GetDirectoryName(filePath);
        var hasParentFolder = !string.IsNullOrEmpty(parentFolder);
        if (hasParentFolder)
        {
            Directory.CreateDirectory(parentFolder);
        }

        var outputEncoding = System.Text.Encoding.Default;
        await File.WriteAllTextAsync(filePath, content, outputEncoding);

        Logger.Info($"文件已成功写入: {filePath}");
    }
    catch (Exception writeError)
    {
        // Best effort: a failed write is reported in the log and not propagated.
        Logger.Error(writeError, $"写入文件时发生错误: {filePath}");
    }
}
/// <summary>
/// Builds the path of the output file: application directory / "下载" / site host / book name + ".txt".
/// </summary>
/// <param name="bookName">Book title as scraped from the site; may contain arbitrary characters.</param>
/// <returns>The full path of the text file for this book.</returns>
private static string GetFilePath(string bookName)
{
    var domain = new Uri(BaseUrl).Host;

    // The title comes from a web page: characters that are not valid in a file name
    // (path separators included) are replaced so the file stays inside the download folder.
    var safeName = bookName ?? string.Empty;
    foreach (var invalidChar in Path.GetInvalidFileNameChars())
    {
        safeName = safeName.Replace(invalidChar, '_');
    }

    safeName = safeName.Trim();
    if (safeName.Length == 0)
    {
        // Same placeholder that is used when the title node is missing.
        safeName = "未知书名";
    }

    return Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "下载", domain, $"{safeName}.txt");
}
/// <summary>
/// Decoder for response bodies, created on first use.
/// </summary>
/// <remarks>
/// Legacy code pages such as GBK are only available on .NET Core / .NET 5+ after the
/// code-pages provider has been registered, so it is registered here before the lookup.
/// </remarks>
private static readonly Lazy<System.Text.Encoding> PageEncoding = new Lazy<System.Text.Encoding>(() =>
{
    System.Text.Encoding.RegisterProvider(System.Text.CodePagesEncodingProvider.Instance);
    return System.Text.Encoding.GetEncoding(Encoding);
});

/// <summary>
/// Downloads a page and decodes it with the site's encoding, retrying on failure.
/// </summary>
/// <param name="url">Absolute URL, or a URL relative to the HTTP client's base address.</param>
/// <returns>
/// The decoded markup, or an empty string when all <c>MaxRetries</c> attempts failed.
/// </returns>
private static async Task<string> GetHtmlAsync(string url)
{
    for (int attempt = 1; attempt <= MaxRetries; attempt++)
    {
        try
        {
            // Dispose the response so its connection is returned to the pool promptly.
            using (var response = await HttpClient.GetAsync(url))
            {
                response.EnsureSuccessStatusCode();
                var responseBody = await response.Content.ReadAsByteArrayAsync();
                var responseString = PageEncoding.Value.GetString(responseBody);
                Logger.Info($"请求 {url} 成功,响应状态码: {(int)response.StatusCode}");
                return responseString;
            }
        }
        // HttpClient reports a timeout as TaskCanceledException, which must be retried as well.
        catch (Exception ex) when (ex is HttpRequestException || ex is TaskCanceledException)
        {
            if (attempt == MaxRetries)
            {
                // No further attempt follows, so do not wait.
                Logger.Warn($"请求 {url} 失败,{ex.Message}");
                break;
            }

            Logger.Warn($"请求 {url} 失败,{ex.Message},10秒后重试...");
            await Task.Delay(TimeSpan.FromSeconds(10));
        }
    }

    Logger.Error($"多次请求 {url} 失败,已放弃。");
    return string.Empty;
}
/// <summary>
/// Builds the shared HTTP client: base address set to the site root and a fixed set of
/// browser-like default request headers.
/// </summary>
/// <returns>The configured client.</returns>
/// <remarks>
/// NOTE(review): the Cookie header is a hard-coded value that includes a PHPSESSID; it looks
/// like a captured browser session and is presumably stale. Confirm whether it is still needed.
/// </remarks>
private static HttpClient CreateHttpClient()
{
    var httpClient = new HttpClient();
    httpClient.BaseAddress = new Uri(BaseUrl);

    var defaultHeaders = httpClient.DefaultRequestHeaders;
    defaultHeaders.Clear();

    // Headers are added in this exact order.
    var browserHeaders = new[]
    {
        new KeyValuePair<string, string>("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"),
        new KeyValuePair<string, string>("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6"),
        new KeyValuePair<string, string>("Cache-Control", "max-age=0"),
        new KeyValuePair<string, string>("Connection", "keep-alive"),
        new KeyValuePair<string, string>("Cookie", "cJumpPV10861=1; autojumpPV10861=6; __51vcke__K0XyxJ3OaBItNhg9=3b3030a8-3fd7-5de0-8777-5218fa3ae91f; __51vuft__K0XyxJ3OaBItNhg9=1719545541437; __51uvsct__K0XyxJ3OaBItNhg9=7; PHPSESSID=363b7e9b0cf6d31cb701f1b747f8bc9c; autojumpStats10861=1; autojumpNum10861=1; cJumpPV10861=1; autojumpPV10861=3; __vtins__K0XyxJ3OaBItNhg9=%7B%22sid%22%3A%20%228336a7ae-3043-5bf6-a7c4-736a2d7c0f9c%22%2C%20%22vd%22%3A%208%2C%20%22stt%22%3A%20334475%2C%20%22dr%22%3A%203517%2C%20%22expires%22%3A%201719579012508%2C%20%22ct%22%3A%201719577212508%7D"),
        new KeyValuePair<string, string>("Upgrade-Insecure-Requests", "1"),
        new KeyValuePair<string, string>("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Mobile Safari/537.36 Edg/119.0.0.0"),
    };

    foreach (var header in browserHeaders)
    {
        defaultHeaders.Add(header.Key, header.Value);
    }

    return httpClient;
}
#endregion
#region Inner Classes
/// <summary>
/// One chapter of the novel: its title, its link and, once downloaded, its text.
/// </summary>
private class Chapter
{
    /// <summary>Creates a chapter whose content is still empty.</summary>
    /// <param name="title">Chapter title.</param>
    /// <param name="url">Link to the chapter's first page.</param>
    public Chapter(string title, string url)
    {
        Title = title;
        Url = url;
    }

    /// <summary>Chapter title.</summary>
    public string Title { get; }

    /// <summary>Link to the chapter's first page.</summary>
    public string Url { get; }

    /// <summary>Chapter text; an empty string until it is assigned.</summary>
    public string Content { get; set; } = string.Empty;
}
#endregion
}
}