.net 爬取网页图片
话不多说,直接上代码。大体功能已经实现,里面有些错误处理还没有处理。使用方法:spider.exe images,其中 images 为指定的存储目录,也可以指定绝对路径。
也可以打包成 Linux 程序在 Linux 上使用,后文提供 Windows/Linux 下的单文件程序。
```
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.IO;
using System.Net.Http;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
namespace spider
{
/// <summary>
/// 结果结构体
/// </summary>
/// <summary>
/// One album entry scraped from a listing page.
/// Auto-properties instead of public mutable fields (names kept lowercase
/// so the object-initializer usage in Main keeps compiling unchanged).
/// </summary>
struct Result
{
    /// <summary>Detail-page URL of the album.</summary>
    public string url { get; set; }
    /// <summary>Album title; used as the download folder name.</summary>
    public string title { get; set; }
}
class Program
{
    // Pool of browser User-Agent strings; one is chosen at random per run.
    private static readonly string[] UA ={
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Linux; U; android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    };

    // Single shared HttpClient for the whole crawl (one per request would leak sockets).
    private static readonly HttpClient client = new HttpClient();

    // Base URL of the listing pages to crawl.
    private static readonly string url = "https://www.mzitu.com/mm/";

    /// <summary>
    /// Entry point. Usage: spider.exe [storageDir] — storageDir may be relative or
    /// absolute; it defaults to the program's working directory when omitted.
    /// </summary>
    static async Task Main(string[] args)
    {
        // BUG FIX: the original assigned the whole args array to a string
        // (a compile error); the first argument is the storage directory.
        string storagePath = args.Length >= 1 ? args[0] : "";
        // Set request headers once on the shared client.
        SetReferer(client, url);
        Random rd = new Random();
        // BUG FIX: the original added the entire UA array as the header value;
        // pick one random User-Agent instead.
        client.DefaultRequestHeaders.Add("User-Agent", UA[rd.Next(UA.Length)]);
        // Total number of listing pages.
        int pages = await GetMaxPage(url, "//div[@class='pagination']/div/a");
        for (int i = 1; i <= pages; i++)
        {
            // Fetch the link + title of every album on this listing page.
            string currUrl = url + $"page/{i}";
            Console.WriteLine(currUrl);
            List<Result> results = await GetCurrentPageLinks(currUrl);
            foreach (Result result in results)
            {
                // Album titles may contain characters that are illegal in paths.
                string albumDir = Path.Combine(storagePath, SanitizeFileName(result.title));
                // CreateDirectory is a no-op when the directory already exists.
                Directory.CreateDirectory(albumDir);
                int imagePages = await GetMaxPage(result.url, "//div[@class='pagenavi']/a");
                for (int j = 1; j <= imagePages; j++)
                {
                    string imagePageUrl = result.url + $"/{j}";
                    Console.WriteLine(imagePageUrl);
                    string imageUrl = await GetImageUrl(imagePageUrl);
                    // Skip detail pages where no image node was found.
                    if (!string.IsNullOrEmpty(imageUrl))
                    {
                        await DownloadAsync(imageUrl, albumDir);
                    }
                }
            }
        }
    }

    /// <summary>
    /// Replaces characters that are invalid in file/directory names with '_'
    /// so scraped titles can safely be used as folder names.
    /// </summary>
    static string SanitizeFileName(string name)
    {
        foreach (char c in Path.GetInvalidFileNameChars())
        {
            name = name.Replace(c, '_');
        }
        return name;
    }

    /// <summary>
    /// Sends a GET request and returns the response body as a string.
    /// Returns "" on any request failure (best-effort crawl continues).
    /// </summary>
    static async Task<string> GetByAsync(string url)
    {
        // Allow decoding of non-UTF8 (e.g. GBK) pages; registering twice is harmless.
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
        try
        {
            HttpResponseMessage responseMessage = await client.GetAsync(url);
            HttpContent content = responseMessage.Content;
            return await content.ReadAsStringAsync();
        }
        catch (Exception ex)
        {
            Console.WriteLine("请求错误:" + ex.ToString());
            return "";
        }
    }

    /// <summary>
    /// Downloads one image and saves it under <paramref name="dir"/>.
    /// A failed download is logged and skipped instead of aborting the whole crawl.
    /// </summary>
    static async Task DownloadAsync(string url, string dir)
    {
        Console.WriteLine($"正在下载:{url}");
        // Derive the file name from the URL path so a query string
        // (e.g. "a.jpg?v=2") cannot leak into the name.
        string fileName = Path.GetFileName(new Uri(url).LocalPath);
        string filePath = Path.Combine(dir, fileName);
        try
        {
            byte[] byteArr = await client.GetByteArrayAsync(url);
            // using disposes the stream; no manual Close()/Dispose() needed.
            using (FileStream fs = new FileStream(filePath, FileMode.Create, FileAccess.Write))
            {
                await fs.WriteAsync(byteArr, 0, byteArr.Length);
            }
            Console.WriteLine($"图片{url}下载完成");
        }
        catch (Exception ex)
        {
            Console.WriteLine("下载错误:" + ex.ToString());
        }
    }

    /// <summary>
    /// Reads the page-count number from the first node matching
    /// <paramref name="pattern"/>; returns 0 when the node is missing
    /// or its text is not a number (e.g. a "next page" link).
    /// </summary>
    static async Task<int> GetMaxPage(string url, string pattern)
    {
        HtmlNode documentNode = await GetDocumentNode(url);
        HtmlNode node = documentNode.SelectSingleNode(pattern);
        // BUG FIX: int.Parse threw on non-numeric anchor text.
        return node != null && int.TryParse(node.InnerText, out int max) ? max : 0;
    }

    /// <summary>
    /// Collects the album links and titles from one listing page.
    /// Returns an empty list when the expected markup is absent.
    /// </summary>
    static async Task<List<Result>> GetCurrentPageLinks(string url)
    {
        List<Result> linkList = new List<Result>();
        HtmlNode documentNode = await GetDocumentNode(url);
        HtmlNodeCollection nodeCollection = documentNode.SelectNodes("//ul[@id='pins']/li/span/a");
        if (nodeCollection == null)
        {
            return linkList;
        }
        foreach (HtmlNode node in nodeCollection)
        {
            string href = node.GetAttributeValue("href", "");
            string title = node.InnerText;
            linkList.Add(new Result { url = href, title = title });
        }
        return linkList;
    }

    /// <summary>
    /// Fetches <paramref name="url"/> and returns the parsed HTML document root.
    /// </summary>
    static async Task<HtmlNode> GetDocumentNode(string url)
    {
        HtmlDocument doc = new HtmlDocument();
        string html = await GetByAsync(url);
        doc.LoadHtml(html);
        return doc.DocumentNode;
    }

    /// <summary>
    /// Extracts the image src from a detail page.
    /// Returns "" when the image node is missing (original threw a NullReferenceException).
    /// </summary>
    static async Task<string> GetImageUrl(string url)
    {
        HtmlNode documentNode = await GetDocumentNode(url);
        HtmlNode node = documentNode.SelectSingleNode("//div[@class='main-image']/p/a/img");
        return node?.GetAttributeValue("src", "") ?? "";
    }

    /// <summary>
    /// Sets the Referer header to the site root (scheme + authority) of
    /// <paramref name="url"/>; the target site rejects hot-linking without it.
    /// </summary>
    static void SetReferer(HttpClient client, string url)
    {
        Uri uri = new Uri(url);
        // More robust than string.Replace on the path (which breaks for path "/").
        client.DefaultRequestHeaders.Add("Referer", uri.GetLeftPart(UriPartial.Authority));
    }
}
}
```
> 文件下载地址:https://wws.lanzouj.com/iXS4lla6xrc 密码:6uu5。压缩包里面是一个 exe 程序和一个 Linux 程序。
GetByAsync可以使用HttpClient.GetStringAsync代替
DownloadAsync 也可使用 HttpClient.GetStreamAsync().CopyTo(fs) 简化。 很强,但是看过 Python 爬图片的代码后,我一下子觉得 .NET 是不是繁琐了。 感谢分享,正好需要这方面的示例。 感谢楼主分享。
页:
[1]