C# 多线程缓存技术小说极速爬取
/// <summary>
/// Console entry point: downloads one novel and writes it to a text file.
/// </summary>
internal class Program
{
    /// <summary>
    /// Loads the chapter list and chapter contents for a fixed book URL,
    /// concatenates all chapters in path order and writes them to
    /// "&lt;title&gt;.txt" in the current directory, then waits for a key press.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static async Task Main(string[] args)
    {
        Book book = new Book();
        book.Url = "http://www.tycqzw.net/113_113116/";
        await book.SetItems();
        await book.SetItemsContent();

        // Sort shorter paths first, then ordinally: a plain string sort would
        // place ".../10000.html" before ".../9999.html".
        var items = book.Items
            .OrderBy(x => x.Path == null ? 0 : x.Path.Length)
            .ThenBy(x => x.Path, StringComparer.Ordinal)
            .ToList();

        StringBuilder stringBuilder = new StringBuilder();
        foreach (var item in items)
        {
            stringBuilder.Append(item.Title).Append('\n');
            // Terminate the chapter body so the next title starts on its own line.
            stringBuilder.Append(item.Content).Append('\n');
        }

        File.WriteAllText(BuildFileName(book.Title), stringBuilder.ToString());
        Console.ReadKey();
    }

    /// <summary>
    /// Turns a book title into a usable ".txt" file name: characters that are
    /// invalid in file names are replaced with '_', and a missing or blank
    /// title falls back to "book".
    /// </summary>
    private static string BuildFileName(string title)
    {
        var name = string.IsNullOrWhiteSpace(title) ? "book" : title.Trim();
        foreach (var invalid in System.IO.Path.GetInvalidFileNameChars())
        {
            name = name.Replace(invalid, '_');
        }
        return name + ".txt";
    }
}
/// <summary>
/// A novel: its title, the URL of its index page and its chapter list.
/// The chapter list (including downloaded contents) is cached in Redis as JSON
/// under the book title.
/// </summary>
public class Book
{
    // Number of chapter downloads allowed to run at the same time.
    private const int MaxConcurrentDownloads = 1000;

    // The chapter list is written to the cache after every this many finished chapters.
    private const int CacheFlushInterval = 10;

    /// <summary>Creates a book with an empty chapter list.</summary>
    public Book()
    {
        Items = new List<BookItem>();
    }

    /// <summary>Book title; also used as the Redis cache key.</summary>
    public string Title { get; set; }

    /// <summary>URL of the book's index page.</summary>
    public string Url { get; set; }

    /// <summary>Chapters of the book.</summary>
    public List<BookItem> Items { get; set; }

    /// <summary>
    /// Adds a chapter unless a chapter with the same <see cref="BookItem.Path"/>
    /// is already present.
    /// </summary>
    /// <param name="item">Chapter to add.</param>
    /// <exception cref="ArgumentNullException"><paramref name="item"/> is null.</exception>
    public void Add(BookItem item)
    {
        if (item == null) throw new ArgumentNullException(nameof(item));
        if (Items.Any(n => n.Path == item.Path)) return;
        Items.Add(item);
    }

    /// <summary>
    /// Populates <see cref="Title"/> and <see cref="Items"/>.
    /// If a title is already set and a cached chapter list exists for it, the
    /// cache is used and no request is made. Otherwise the index page is
    /// fetched to learn the title, the cache is consulted under that title, and
    /// only on a cache miss is the chapter list parsed from the page and stored.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The index page returned no content or has no "#info h1" element.
    /// </exception>
    internal async Task SetItems()
    {
        var redis = Redis.GetInstance();

        // A caller-supplied title allows a cache hit without any network access.
        var titleKnownUpFront = !string.IsNullOrWhiteSpace(this.Title);
        if (titleKnownUpFront && TryLoadItemsFromCache(redis))
        {
            return;
        }

        var client = new RestClient(this.Url);
        var request = new RestRequest();
        var response = await client.ExecuteAsync(request);
        var content = response.Content;
        if (string.IsNullOrEmpty(content))
        {
            throw new InvalidOperationException("No content received from " + this.Url);
        }

        var parser = new HtmlParser();
        var doc = parser.ParseDocument(content);
        var heading = doc.QuerySelector("#info h1");
        if (heading == null)
        {
            throw new InvalidOperationException("Book title (#info h1) not found at " + this.Url);
        }

        this.Title = heading.InnerHtml;
        Console.WriteLine(this.Title);

        // The cache key is the title, which is only known now; look up any
        // previously saved progress before rebuilding the chapter list.
        if (!titleKnownUpFront && TryLoadItemsFromCache(redis))
        {
            return;
        }

        // Track seen paths so a chapter linked more than once on the page is
        // added only once (same rule as Add, without the repeated list scan).
        var seenPaths = new HashSet<string>(Items.Select(i => i.Path));
        foreach (var anchor in doc.QuerySelectorAll("dd a").OfType<IHtmlAnchorElement>())
        {
            if (seenPaths.Add(anchor.PathName))
            {
                Items.Add(new BookItem() { Title = anchor.InnerHtml, Path = anchor.PathName });
            }
        }

        redis.Set(this.Title, JsonConvert.SerializeObject(this.Items));
    }

    /// <summary>
    /// Downloads the content of every chapter whose <see cref="BookItem.Content"/>
    /// is still null, running the downloads concurrently, and saves the chapter
    /// list to the cache periodically and once more when all downloads finish.
    /// </summary>
    internal async Task SetItemsContent()
    {
        var finished = 0;
        var redis = Redis.GetInstance();

        // Semaphore bounding the number of downloads in flight.
        using (var semaphore = new SemaphoreSlim(MaxConcurrentDownloads, MaxConcurrentDownloads))
        {
            async Task ProcessTask(BookItem bookItem)
            {
                await semaphore.WaitAsync();
                try
                {
                    string mess = bookItem.Title;
                    await bookItem.SetContent();
                    // SetContent leaves Content null when the download fails.
                    mess += bookItem.Content == null ? "-获取失败" : "-获取完成";

                    // Continuations may run on different threads, so the
                    // counter must be updated atomically.
                    var done = Interlocked.Increment(ref finished);
                    if (done % CacheFlushInterval == 0)
                    {
                        redis.Set(this.Title, JsonConvert.SerializeObject(this.Items));
                        mess += "-写入缓存";
                    }
                    Console.WriteLine(mess);
                }
                finally
                {
                    // Release the slot so the next waiting download can start.
                    semaphore.Release();
                }
            }

            List<Task> tasks = this.Items
                .Where(item => item.Content == null)
                .Select(ProcessTask)
                .ToList();

            // Wait for every download to finish before disposing the semaphore.
            await Task.WhenAll(tasks);
        }

        redis.Set(this.Title, JsonConvert.SerializeObject(this.Items));
    }

    /// <summary>
    /// Replaces <see cref="Items"/> with the chapter list cached under
    /// <see cref="Title"/>, if there is one. Returns true on a cache hit.
    /// </summary>
    private bool TryLoadItemsFromCache(RedisClient redis)
    {
        var json = redis.Get(this.Title);
        if (string.IsNullOrWhiteSpace(json))
        {
            return false;
        }

        var cached = JsonConvert.DeserializeObject<List<BookItem>>(json);
        if (cached == null)
        {
            return false;
        }

        this.Items = cached;
        return true;
    }
}
/// <summary>
/// One chapter of a book: its title, its URL path on the site and its text.
/// </summary>
public class BookItem
{
    /// <summary>Chapter title.</summary>
    public string Title { get; set; }

    /// <summary>Path of the chapter page, appended to the site root when downloading.</summary>
    public string Path { get; set; }

    /// <summary>Chapter text; stays null until a download succeeds.</summary>
    public string Content { get; set; }

    /// <summary>
    /// Downloads the chapter page and stores the text of its "#content" div in
    /// <see cref="Content"/>. This is best-effort: on any failure the problem
    /// is reported on standard error and <see cref="Content"/> is left
    /// unchanged; no exception is propagated to the caller.
    /// </summary>
    internal async Task SetContent()
    {
        try
        {
            // Drop leading slashes from the path so the URL does not contain "//".
            var url = "http://www.tycqzw.net/" + (Path ?? string.Empty).TrimStart('/');
            var client = new RestClient(url);
            var request = new RestRequest();
            var response = await client.ExecuteAsync(request);

            var html = response.Content;
            if (string.IsNullOrEmpty(html))
            {
                Console.Error.WriteLine("Chapter download returned no content: " + url);
                return;
            }

            var parser = new HtmlParser();
            var doc = parser.ParseDocument(html);
            IHtmlDivElement contentMain = doc.QuerySelector("#content") as IHtmlDivElement;
            if (contentMain == null)
            {
                Console.Error.WriteLine("Chapter page has no #content div: " + url);
                return;
            }

            this.Content = contentMain.TextContent;
        }
        catch (Exception ex)
        {
            // Keep the download best-effort, but do not hide the failure.
            Console.Error.WriteLine("Chapter download failed (" + Path + "): " + ex.Message);
        }
    }
}
/// <summary>
/// Holder for a single shared <see cref="RedisClient"/>.
/// The client is created on the first call to <see cref="GetInstance"/>;
/// no synchronization is performed around that creation.
/// </summary>
public class Redis
{
    // Shared client; null until GetInstance is first called.
    private static RedisClient redisClient;

    // Not instantiable: the class only exposes the static accessor.
    private Redis()
    {
    }

    /// <summary>
    /// Returns the shared client, creating it with the local connection string
    /// on first use.
    /// </summary>
    public static RedisClient GetInstance()
    {
        return redisClient
            ?? (redisClient = new RedisClient("127.0.0.1:6379,defaultDatabase=12"));
    }
}
感谢分享。 感谢大佬分享。 难得看到C#的教程,支持。 感谢分享,大神牛x,必须点赞。 看论坛多数是用python的,我用C Sharp 学习了! 不会用,大神谁帮忙打包下成品啊? 好东西,抽空要学习一下。
页:
[1]