吾爱破解 - 52pojie.cn

 找回密码
 注册[Register]

QQ登录

只需一步,快速开始

查看: 1455|回复: 7
收起左侧

[其他原创] C# 多线程缓存技术小说极速爬取

  [复制链接]
TZ糖纸 发表于 2024-8-14 10:08
[C#] 纯文本查看 复制代码
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/// <summary>
/// Console entry point: downloads every chapter of one novel and writes them to a single text file.
/// </summary>
internal class Program
{
    /// <summary>
    /// Loads the chapter list and the chapter texts for the configured book, writes them to a
    /// text file named after the book title in the current directory, then waits for a key press.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static async Task Main(string[] args)
    {
        Book book = new Book();
        book.Url = "http://www.tycqzw.net/113_113116/";
        await book.SetItems();

        await book.SetItemsContent();

        // Order by path length first and ordinally second: for paths that differ only in a
        // trailing numeric id this yields numeric order, whereas a plain string sort would
        // place ".../100.html" before ".../99.html".
        var items = book.Items
            .OrderBy(x => x.Path == null ? 0 : x.Path.Length)
            .ThenBy(x => x.Path, StringComparer.Ordinal)
            .ToList();

        StringBuilder stringBuilder = new StringBuilder();

        foreach (var item in items)
        {
            stringBuilder.Append(item.Title).Append('\n');
            // Terminate each chapter so the next title does not run into the previous text.
            stringBuilder.Append(item.Content).Append('\n');
        }

        File.WriteAllText(BuildFileName(book.Title), stringBuilder.ToString());

        Console.ReadKey();
    }

    /// <summary>
    /// Builds the output file name from the book title, replacing characters that are not
    /// allowed in file names with '_' and falling back to "book.txt" when there is no title.
    /// </summary>
    /// <param name="title">Book title; may be null or blank.</param>
    /// <returns>A file name ending in ".txt".</returns>
    private static string BuildFileName(string title)
    {
        if (string.IsNullOrWhiteSpace(title))
        {
            return "book.txt";
        }

        char[] invalid = Path.GetInvalidFileNameChars();
        StringBuilder safe = new StringBuilder(title.Length + 4);
        foreach (char c in title)
        {
            safe.Append(Array.IndexOf(invalid, c) >= 0 ? '_' : c);
        }

        return safe.Append(".txt").ToString();
    }
}




[C#] 纯文本查看 复制代码
001
002
003
004
005
006
007
008
009
010
011
012
013
014
015
016
017
018
019
020
021
022
023
024
025
026
027
028
029
030
031
032
033
034
035
036
037
038
039
040
041
042
043
044
045
046
047
048
049
050
051
052
053
054
055
056
057
058
059
060
061
062
063
064
065
066
067
068
069
070
071
072
073
074
075
076
077
078
079
080
081
082
083
084
085
086
087
088
089
090
091
092
093
094
095
096
097
098
099
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/// <summary>
/// A novel: its title, index-page URL and chapter list. Provides helpers that fetch the chapter
/// list and the chapter texts, and that store the chapter list in Redis as JSON under the title.
/// </summary>
public class Book
{

    /// <summary>Creates a book with an empty chapter list.</summary>
    public Book()
    {
        if (Items == null)
        {
            Items = new List<BookItem>();
        }
    }

    /// <summary>Book title; also used as the Redis cache key.</summary>
    public string Title { get; set; }

    /// <summary>URL of the book's index (chapter list) page.</summary>
    public string Url { get; set; }

    /// <summary>Chapters of the book.</summary>
    public List<BookItem> Items { get; set; }

    /// <summary>
    /// Adds a chapter unless one with the same <see cref="BookItem.Path"/> is already present.
    /// </summary>
    /// <param name="item">Chapter to add; null is ignored.</param>
    public void Add(BookItem item)
    {
        if (item == null)
        {
            return;
        }

        if (Items.Any(n => n.Path == item.Path))
        {
            return;
        }

        Items.Add(item);
    }

    /// <summary>
    /// Populates <see cref="Title"/> and <see cref="Items"/>.
    /// If a title is already set and a cached chapter list exists for it, that list is used and
    /// no request is made. Otherwise the index page at <see cref="Url"/> is downloaded, the
    /// title is read from "#info h1", any chapter list cached under that title is restored
    /// (keeping already-downloaded text), chapters found in "dd a" that are not yet known are
    /// appended, and the result is written back to the cache.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The downloaded page contains no "#info h1" element.
    /// </exception>
    internal async Task SetItems()
    {
        var redis = Redis.GetInstance();

        // The cache is keyed by title, so it can only be consulted up front when the caller
        // supplied a title.
        if (TryLoadCachedItems())
        {
            return;
        }

        var client = new RestClient(this.Url);
        var request = new RestRequest();
        var response = await client.ExecuteAsync(request);
        var parser = new HtmlParser();
        var doc = parser.ParseDocument(response.Content ?? string.Empty);

        var titleElement = doc.QuerySelector("#info h1");
        if (titleElement == null)
        {
            throw new InvalidOperationException(
                "No '#info h1' element found in the page at " + this.Url);
        }

        this.Title = titleElement.InnerHtml;
        Console.WriteLine(this.Title);

        // Now that the title is known, restore a previously cached list so chapter text
        // downloaded by an earlier run is not overwritten by empty entries.
        TryLoadCachedItems();

        var knownPaths = new HashSet<string>(Items.Select(i => i.Path));
        foreach (var anchor in doc.QuerySelectorAll("dd a").OfType<IHtmlAnchorElement>())
        {
            // HashSet.Add returns false for a path that is already present.
            if (knownPaths.Add(anchor.PathName))
            {
                Items.Add(new BookItem() { Title = anchor.InnerHtml, Path = anchor.PathName });
            }
        }

        redis.Set(this.Title, JsonConvert.SerializeObject(this.Items));
    }

    /// <summary>
    /// Downloads the text of every chapter whose <see cref="BookItem.Content"/> is null,
    /// running at most <paramref name="maxConcurrency"/> downloads at a time. The chapter list
    /// is written to the cache after every tenth completed chapter and once more at the end.
    /// </summary>
    /// <param name="maxConcurrency">Maximum number of simultaneous downloads (default 1000).</param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="maxConcurrency"/> is less than 1.
    /// </exception>
    internal async Task SetItemsContent(int maxConcurrency = 1000)
    {
        if (maxConcurrency < 1)
        {
            throw new ArgumentOutOfRangeException(nameof(maxConcurrency));
        }

        var completed = 0;
        var redis = Redis.GetInstance();

        // The semaphore caps how many chapter downloads are in flight at once.
        using (var semaphore = new SemaphoreSlim(maxConcurrency, maxConcurrency))
        {
            async Task ProcessTask(BookItem bookItem)
            {
                await semaphore.WaitAsync();
                try
                {
                    string mess = "";
                    mess += bookItem.Title;
                    await bookItem.SetContent();
                    mess += "-获取完成";

                    // Continuations may run concurrently, so the counter is updated atomically.
                    if (Interlocked.Increment(ref completed) % 10 == 0)
                    {
                        redis.Set(this.Title, JsonConvert.SerializeObject(this.Items));
                        mess += "-写入缓存";
                    }
                    Console.WriteLine(mess);
                }
                finally
                {
                    // Release the slot so the next waiting download can start.
                    semaphore.Release();
                }
            }

            // Start one task per chapter that still lacks content.
            List<Task> tasks = this.Items
                .Where(item => item.Content == null)
                .Select(ProcessTask)
                .ToList();

            // Wait for every download to finish before disposing the semaphore.
            await Task.WhenAll(tasks);
        }

        redis.Set(this.Title, JsonConvert.SerializeObject(this.Items));
    }

    /// <summary>
    /// Replaces <see cref="Items"/> with the chapter list cached under <see cref="Title"/>.
    /// </summary>
    /// <returns>
    /// True when a cached list was found and loaded; false when the title is blank, nothing
    /// is cached under it, or the cached value does not deserialize to a list.
    /// </returns>
    private bool TryLoadCachedItems()
    {
        if (string.IsNullOrWhiteSpace(this.Title))
        {
            return false;
        }

        var json = Redis.GetInstance().Get(this.Title);
        if (string.IsNullOrWhiteSpace(json))
        {
            return false;
        }

        var cached = JsonConvert.DeserializeObject<List<BookItem>>(json);
        if (cached == null)
        {
            return false;
        }

        this.Items = cached;
        return true;
    }
}
/// <summary>
/// One chapter of a book: its title, its path on the site, and its downloaded text.
/// </summary>
public class BookItem
{
    // A single client is shared by all chapters instead of creating one per request.
    private static readonly RestClient Client = new RestClient("http://www.tycqzw.net/");

    /// <summary>Chapter title.</summary>
    public string Title { get; set; }

    /// <summary>Path of the chapter page, relative to the site root.</summary>
    public string Path { get; set; }

    /// <summary>Chapter text; null until it has been downloaded successfully.</summary>
    public string Content { get; set; }

    /// <summary>
    /// Downloads the chapter page and stores the text of its "#content" div in
    /// <see cref="Content"/>. This is best-effort: on any failure the problem is reported on
    /// standard error and <see cref="Content"/> is left unchanged, so no exception escapes.
    /// </summary>
    internal async Task SetContent()
    {
        if (string.IsNullOrWhiteSpace(Path))
        {
            Console.Error.WriteLine("Skipping chapter '" + Title + "': no path.");
            return;
        }

        try
        {
            // Drop leading slashes so the request URL does not contain "//" after the host.
            var request = new RestRequest(Path.TrimStart('/'));
            var response = await Client.ExecuteAsync(request);
            var content = response.Content;
            if (string.IsNullOrEmpty(content))
            {
                Console.Error.WriteLine("Empty response for chapter '" + Title + "' (" + Path + ").");
                return;
            }

            var parser = new HtmlParser();
            var doc = parser.ParseDocument(content);
            if (doc.QuerySelector("#content") is IHtmlDivElement contentMain)
            {
                this.Content = contentMain.TextContent;
            }
            else
            {
                Console.Error.WriteLine("No '#content' div for chapter '" + Title + "' (" + Path + ").");
            }
        }
        catch (Exception ex)
        {
            // Keep the download best-effort, but make the failure visible.
            Console.Error.WriteLine("Failed to fetch chapter '" + Title + "' (" + Path + "): " + ex.Message);
        }

    }
}



[C#] 纯文本查看 复制代码
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
/// <summary>
/// Holds the single shared <see cref="RedisClient"/> used by the application.
/// </summary>
public class Redis
{
    // Lazy<T> with its default mode runs the factory at most once, even when several threads
    // request the instance at the same time.
    private static readonly System.Lazy<RedisClient> redisClient =
        new System.Lazy<RedisClient>(() => new RedisClient("127.0.0.1:6379,defaultDatabase=12"));

    // Not instantiable; use GetInstance().
    private Redis() { }

    /// <summary>
    /// Returns the shared client, creating it on first use.
    /// </summary>
    /// <returns>The shared <see cref="RedisClient"/>.</returns>
    public static RedisClient GetInstance()
    {
        return redisClient.Value;
    }
}

e4d6d2de0be7974b7249f2fcdc10db9.png

免费评分

参与人数 6吾爱币 +13 热心值 +6 收起 理由
yekai2024 + 1 + 1 我很赞同!
苏紫方璇 + 7 + 1 欢迎分析讨论交流,吾爱破解论坛有你更精彩!
1783780690 + 1 + 1 热心回复!
a5436539 + 1 + 1 我很赞同!
williamlyf + 1 + 1 学习,加个注释就更完美了...
wkfy + 2 + 1 我很赞同!

查看全部评分

本帖被以下淘专辑推荐:

发帖前要善用论坛搜索功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。

小小小酥 发表于 2024-8-14 13:38
感谢大佬分享
SONGXINGJING520 发表于 2024-8-15 10:32
caisonglinlove 发表于 2024-8-16 03:50
msmvc 发表于 2024-8-17 19:01
看论坛多数是用python的,我用C Sharp
findevery 发表于 2024-8-23 13:34
学习了!!!
81169009 发表于 2024-9-5 18:40
不会用,大神谁帮忙打包下成品啊
xyj152 发表于 2024-9-6 08:15
好东西,抽空要学习一下
您需要登录后才可以回帖 登录 | 注册[Register]

本版积分规则

返回列表

RSS订阅|小黑屋|处罚记录|联系我们|吾爱破解 - LCG - LSG ( 京ICP备16042023号 | 京公网安备 11010502030087号 )

GMT+8, 2025-4-1 23:37

Powered by Discuz!

Copyright © 2001-2020, Tencent Cloud.

快速回复 返回顶部 返回列表