本帖最后由 TZ糖纸 于 2020-8-12 10:43 编辑
这里使用了HtmlAgilityPack以及HttpHelper
HtmlAgilityPack 可以直接通过 NuGet 安装
HttpHelper请自行百度
Gitee地址:https://gitee.com/TZTZTZ980929/Spider
C# 代码:
using SufeiUtil;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Threading;
namespace Spider
{
/// <summary>
/// Crawler that reads the archive page at https://www.mzitu.com/all/, follows every
/// album link found there and saves the main image of each album page as a .jpg file
/// under <see cref="rootPath"/>.
/// </summary>
public class MeiZiTu
{
    /// <summary>
    /// Directory the images are written to. When left null, <see cref="Test1"/> sets it to
    /// the "mzitu\" folder under the application base directory.
    /// </summary>
    public string rootPath = null;

    /// <summary>
    /// Number of pages of the album currently being downloaded; reset to 0 by
    /// <see cref="Test1"/> after each album.
    /// </summary>
    public int pageCount = 0;

    // Shared parser instance; every helper reloads it before querying, so no state is
    // carried over from one page to the next.
    HtmlAgilityPack.HtmlDocument dc = new HtmlAgilityPack.HtmlDocument();

    /// <summary>
    /// Entry point: downloads every album linked from the archive page, skipping links
    /// whose URL contains "old".
    /// </summary>
    public void Test1()
    {
        if (rootPath == null)
        {
            rootPath = System.AppDomain.CurrentDomain.SetupInformation.ApplicationBase + "mzitu\\";
        }

        // Ensure the target directory exists even when the caller supplied rootPath
        // (previously it was only created for the default path).
        if (!checkDir(rootPath))
        {
            Console.WriteLine("Cannot create output directory: " + rootPath);
            return;
        }

        // Fetch the HTML of the archive page that lists all albums.
        var allHtml = GetHtml("https://www.mzitu.com/all/");
        foreach (var url in GetALlUrl(allHtml))
        {
            if (url.Contains("old"))
            {
                continue;
            }

            var pageHtml = GetHtml(url);
            downloadAllImage(url, pageHtml);
            this.pageCount = 0;
        }
    }

    /// <summary>
    /// Downloads the image of every page of one album.
    /// </summary>
    /// <param name="url">URL of the album's first page; page N (N &gt; 1) is fetched from url + "/" + N.</param>
    /// <param name="pageHtml">Already downloaded HTML of the first page.</param>
    private void downloadAllImage(string url, string pageHtml)
    {
        if (string.IsNullOrEmpty(pageHtml))
        {
            Console.WriteLine("Empty page, skipping album: " + url);
            return;
        }

        try
        {
            this.pageCount = ReadPageCount(pageHtml);

            for (int i = 1; i <= this.pageCount; i++)
            {
                try
                {
                    // The first page was already fetched by the caller; reuse it
                    // instead of requesting the same URL a second time.
                    var html = i == 1 ? pageHtml : GetHtml(url + "/" + i);
                    downloadImage(html);
                }
                catch (Exception ex)
                {
                    // Best effort: one broken page must not abort the rest of the album.
                    Console.WriteLine("Failed to download page " + i + " of " + url + ": " + ex.Message);
                }

                // Pause between pages to avoid hammering the server.
                Thread.Sleep(2000);
            }
        }
        catch (Exception ex)
        {
            // Still best effort, but no longer silent.
            Console.WriteLine("Failed to download album " + url + ": " + ex.Message);
        }
    }

    /// <summary>
    /// Reads the album's page count from the pagination bar: the second-to-last link of
    /// the "pagenavi" div is parsed as the number of the last page.
    /// Returns 1 when the bar is missing or that link is not a number.
    /// </summary>
    private int ReadPageCount(string pageHtml)
    {
        dc.LoadHtml(pageHtml);

        // SelectNodes returns null (not an empty collection) when nothing matches.
        var navBars = dc.DocumentNode.SelectNodes("//div[@class='pagenavi']");
        if (navBars == null)
        {
            return 1;
        }

        var links = navBars.Descendants("a").ToArray();
        int lastPage;
        if (links.Length >= 2 && int.TryParse(links[links.Length - 2].InnerText, out lastPage) && lastPage > 0)
        {
            return lastPage;
        }

        return 1;
    }

    /// <summary>
    /// Extracts title and main image URL from one album page and saves the image as
    /// "&lt;title&gt;.jpg". Pages lacking either element are reported and skipped.
    /// </summary>
    /// <param name="html">HTML of a single album page.</param>
    private void downloadImage(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            Console.WriteLine("Empty page, nothing to download.");
            return;
        }

        dc.LoadHtml(html);
        var titleNode = dc.DocumentNode.SelectSingleNode("//h2[@class='main-title']");
        var imgNode = dc.DocumentNode.SelectSingleNode("//div[@class='main-image']/p/a/img");
        if (titleNode == null || imgNode == null)
        {
            Console.WriteLine("Page has no title or main image, skipping.");
            return;
        }

        var imgUrl = imgNode.GetAttributeValue("src", string.Empty);
        if (string.IsNullOrEmpty(imgUrl))
        {
            Console.WriteLine("Main image has no src attribute, skipping.");
            return;
        }

        var title = SanitizeFileName(titleNode.InnerText);
        var byteImage = GetBytesFromUrl(imgUrl);
        WriteBytesToFile(title, byteImage);
    }

    /// <summary>
    /// Removes '?' and '!' (as before) plus every character the current platform does
    /// not allow in a file name, so the title can be used as one.
    /// </summary>
    private static string SanitizeFileName(string name)
    {
        var invalid = Path.GetInvalidFileNameChars();
        return new string(name
            .Where(c => c != '?' && c != '!' && Array.IndexOf(invalid, c) < 0)
            .ToArray());
    }

    /// <summary>
    /// Downloads the resource at <paramref name="url"/> and returns its complete content.
    /// The URL itself is sent as the Referer header.
    /// </summary>
    /// <param name="url">Absolute URL of the resource.</param>
    /// <returns>All bytes of the response body.</returns>
    public byte[] GetBytesFromUrl(string url)
    {
        HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(url);
        myReq.Referer = url;

        // using blocks release the connection even when reading fails.
        using (WebResponse myResp = myReq.GetResponse())
        using (Stream stream = myResp.GetResponseStream())
        using (MemoryStream buffer = new MemoryStream())
        {
            // Read to the end of the stream; the former fixed 500000-byte read
            // truncated larger images.
            stream.CopyTo(buffer);
            return buffer.ToArray();
        }
    }

    /// <summary>
    /// Writes <paramref name="content"/> to "&lt;rootPath&gt;/&lt;fileName&gt;.jpg", overwriting an
    /// existing file, and prints the target path to the console.
    /// </summary>
    /// <param name="fileName">File name without extension.</param>
    /// <param name="content">Bytes to write.</param>
    public void WriteBytesToFile(string fileName, byte[] content)
    {
        // Path.Combine inserts the separator when rootPath does not end with one;
        // a null rootPath keeps meaning "relative to the current directory".
        string path = Path.Combine(this.rootPath ?? string.Empty, fileName + ".jpg");
        Console.WriteLine(path);
        File.WriteAllBytes(path, content);
    }

    /// <summary>
    /// Makes sure the directory <paramref name="url"/> exists, creating it if necessary.
    /// </summary>
    /// <param name="url">Directory path (despite the parameter name, not a web URL).</param>
    /// <returns>true when the directory exists afterwards; false when any exception occurred.</returns>
    public static bool checkDir(string url)
    {
        try
        {
            // Create the folder only when it does not exist yet.
            if (!Directory.Exists(url))
            {
                Directory.CreateDirectory(url);
            }
            return true;
        }
        catch (Exception)
        {
            return false;
        }
    }

    /// <summary>
    /// Collects the href of every link inside the div with class "all".
    /// </summary>
    /// <param name="allHtml">HTML of the archive page.</param>
    /// <returns>The link targets in document order; empty when nothing is found.</returns>
    private List<string> GetALlUrl(string allHtml)
    {
        List<string> urlList = new List<string>();
        if (string.IsNullOrEmpty(allHtml))
        {
            return urlList;
        }

        dc.LoadHtml(allHtml);

        // SelectNodes returns null (not an empty collection) when nothing matches.
        var all = dc.DocumentNode.SelectNodes("//div[@class='all']");
        if (all == null)
        {
            return urlList;
        }

        foreach (var link in all.Descendants("a"))
        {
            var href = link.GetAttributeValue("href", string.Empty);
            if (!string.IsNullOrEmpty(href))
            {
                urlList.Add(href);
            }
        }
        return urlList;
    }

    /// <summary>
    /// Performs a GET request for <paramref name="url"/> through HttpHelper and returns
    /// the response body as a string.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <returns>The Html value of the HttpResult produced by HttpHelper.</returns>
    public string GetHtml(string url)
    {
        HttpHelper http = new HttpHelper();
        HttpItem item = new HttpItem()
        {
            URL = url,
            Method = "GET",
            Timeout = 100000,
            ReadWriteTimeout = 30000,
            IsToLower = false,
            Cookie = "",
            // Browser type, version and operating system reported to the server.
            UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0",
            Accept = "text/html, application/xhtml+xml, */*",
            ContentType = "text/html",
            Referer = "",
            Allowautoredirect = false,
            Postdata = "",
            ResultType = ResultType.String,
        };
        HttpResult result = http.GetHtml(item);
        return result.Html;
    }
}
}
|