// [C#] source listing (forum header residue: "view as plain text / copy code")
/// <summary>
/// Click handler for the start/stop button. The button caption doubles as the
/// run flag: "开始爬取" means idle, "停止爬取" means a crawl is in progress.
/// When idle, validates the input and runs the crawl synchronously via
/// <c>GetTotalUrl</c>; when running, flips the caption back so the crawl
/// loops (which poll the caption) stop.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void StartCrawling_Click(object sender, EventArgs e)
{
    // A click while a crawl is running only requests a stop.
    if (StartCrawling.Text != "开始爬取")
    {
        StartCrawling.Text = "开始爬取";
        return;
    }

    // Collect the checked category boxes.
    List<CheckBox> selected = new List<CheckBox>();
    foreach (var item in selection)
    {
        if (item.Checked)
        {
            selected.Add(item);
        }
    }

    // Reject the request when nothing is selected OR the quantity is missing,
    // non-numeric, or negative. (The original test used "&&" with "!= 0", so an
    // empty selection caused a division by zero later and an empty quantity
    // made int.Parse throw.)
    int totalnum = 0;
    bool quantityOk = int.TryParse(CrawlingQuantity.Text, out totalnum) && totalnum >= 0;
    if (selected.Count == 0 || !quantityOk)
    {
        MessageBox.Show("未选中爬取项或者未输入爬取数量,爬取数量:0 为一直爬取");
        return;
    }

    // Only switch to the "running" caption once the input is known to be valid.
    StartCrawling.Text = "停止爬取";

    // A quantity of 0 means "crawl without a practical limit".
    if (totalnum == 0)
    {
        totalnum = 999999999;
    }

    progressBar1.Maximum = totalnum;
    GetTotalUrl(totalnum, selected);
}
/// <summary>
/// Performs an HTTP request for <paramref name="url"/> with the given Referer
/// header and a fixed desktop-Chrome User-Agent, and returns the response body
/// as text.
/// </summary>
/// <param name="url">Address to request.</param>
/// <param name="referer">Value sent in the Referer header.</param>
/// <returns>
/// The response body, or an empty string when any exception occurred (the
/// exception text is shown in a message box).
/// </returns>
private string DownloadHtml(string url, string referer)
{
    string result = "";
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Referer = referer;
        request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36";

        // Dispose response, stream and reader even when reading fails, so the
        // underlying connection is always released.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream))
        {
            result = reader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.ToString());
    }
    return result;
}
/// <summary>
/// Splits the requested item count across the selected categories and crawls
/// each category in turn via <c>GetUrl</c>. The count is divided evenly and the
/// remainder is handed out one extra item at a time to the first categories.
/// When finished, resets the progress bar, shows a completion message and
/// restores the button caption to the idle state.
/// </summary>
/// <param name="num">Total number of items to crawl across all categories.</param>
/// <param name="selection">Checked category boxes; each box's text is looked up in <c>classification</c> to get its start URL.</param>
private void GetTotalUrl(int num, List<CheckBox> selection)
{
    // Caption "开始爬取" means the crawl is not (or no longer) running.
    if (StartCrawling.Text == "开始爬取") return;

    // Nothing to crawl: avoid dividing by zero and return to the idle state.
    if (selection == null || selection.Count == 0)
    {
        StartCrawling.Text = "开始爬取";
        return;
    }

    int avgnum = num / selection.Count;
    int remainder = num % selection.Count;

    // Parse the optional start page once. TryParse tolerates an empty or
    // non-numeric box (the original called int.Parse before checking for "").
    int beginPage = 0;
    bool hasBeginPage = int.TryParse(BeginPage.Text, out beginPage) && beginPage > 1;

    foreach (var item in selection)
    {
        // Honour a stop request between categories instead of issuing one more
        // listing request per remaining category.
        if (StartCrawling.Text == "开始爬取") break;

        int realnum = avgnum;
        if (remainder > 0)
        {
            realnum++;
            remainder--;
        }

        // Skip categories that have no URL registered.
        string url;
        if (!classification.TryGetValue(item.Text, out url) || string.IsNullOrEmpty(url))
        {
            continue;
        }

        if (hasBeginPage)
        {
            url += "page/" + beginPage.ToString();
        }

        GetUrl(realnum, url);
    }

    progressBar1.Value = 0;
    MessageBox.Show("爬取完毕!");
    StartCrawling.Text = "开始爬取";
}
/// <summary>
/// Downloads one listing page, walks the <c>li</c> entries of its "postlist"
/// block, and calls <c>DownloadImg</c> for each entry until
/// <paramref name="totalnum"/> entries have been processed. If the page did not
/// supply enough entries, follows the "next page-numbers" link recursively for
/// the remaining count.
/// </summary>
/// <param name="totalnum">Maximum number of entries to process starting from this page.</param>
/// <param name="url">Address of the listing page; also passed on as the referer for the entry pages.</param>
private void GetUrl(int totalnum, string url)
{
    int num = 0;
    string htmlCode = DownloadHtml(url, "https://www.mzitu.com/");
    if (htmlCode == "") return;

    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(htmlCode);

    // Any of these lookups yields null when the page layout is not the
    // expected one; bail out instead of dereferencing null.
    HtmlNode postlist = doc.DocumentNode.SelectSingleNode("//div[starts-with(@class,'postlist')]");
    if (postlist == null) return;
    HtmlNode list = postlist.SelectSingleNode("ul");
    if (list == null) return;
    HtmlNodeCollection pins = list.SelectNodes("li");
    if (pins == null) return;

    // Save root: current directory + timestamp taken at the time of this call.
    string rootPath = System.Environment.CurrentDirectory + "\\" + DateTime.Now.ToString("yyyyMMddHHmmss") + "\\";

    foreach (var li in pins)
    {
        if (StartCrawling.Text == "开始爬取") return;
        if (num == totalnum) break;

        // Skip entries whose class is "box". The original wrote
        // GetAttributeValue("class", "" != "box"), which binds to the bool
        // overload with a default of true and therefore never filtered them.
        if (li.GetAttributeValue("class", "") == "box") continue;

        HtmlNode link = li.SelectSingleNode("a");
        if (link == null) continue;
        HtmlNode thumb = link.SelectSingleNode("img");
        if (thumb == null) continue;

        string albumUrl = link.GetAttributeValue("href", "");
        if (albumUrl == "") continue;

        // The title becomes a directory name; replace characters the file
        // system rejects so CreateDirectory does not throw.
        string title = thumb.GetAttributeValue("alt", "");
        string folder = string.Join("_", title.Split(Path.GetInvalidFileNameChars()));
        string path = rootPath + folder;
        if (!Directory.Exists(path)) Directory.CreateDirectory(path);

        DownloadImg(path, albumUrl, albumUrl, url);

        // Never push the bar past its maximum (that would throw).
        if (progressBar1.Value < progressBar1.Maximum) progressBar1.Value++;
        num++;
    }

    if (num < totalnum)
    {
        HtmlNode navlinks = doc.DocumentNode.SelectSingleNode("//a[starts-with(@class,'next page-numbers')]");
        if (navlinks == null) return;
        GetUrl(totalnum - num, navlinks.GetAttributeValue("href", ""));
    }
}
/// <summary>
/// Downloads one page of an album, saves the image found in its "main-image"
/// block as a .jpg named after the page's "main-title" heading, waits for the
/// interval entered in <c>Interval</c> (seconds), and then recurses into the
/// next page of the album if the current page links to it.
/// </summary>
/// <param name="path">Directory the image is saved into.</param>
/// <param name="url">Address of the album page to process.</param>
/// <param name="f_url">Address of the album's first page; next-page addresses are built as <c>f_url/N</c>.</param>
/// <param name="referer">Referer header sent with the page and image requests.</param>
private void DownloadImg(string path, string url, string f_url, string referer)
{
    if (StartCrawling.Text == "开始爬取") return;

    string htmlCode = DownloadHtml(url, referer);
    if (htmlCode == "") return;

    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(htmlCode);

    // Walk div.main-image > p > a > img one step at a time so a missing node
    // ends the walk instead of causing a null dereference.
    HtmlNode mainimage = doc.DocumentNode.SelectSingleNode("//div[starts-with(@class,'main-image')]");
    string[] steps = { "p", "a", "img" };
    foreach (string step in steps)
    {
        if (mainimage == null) break;
        mainimage = mainimage.SelectSingleNode(step);
    }
    if (mainimage == null) return;

    string imgurl = mainimage.GetAttributeValue("src", "");
    if (imgurl == "") return;

    HtmlNode titleNode = doc.DocumentNode.SelectSingleNode("//h2[starts-with(@class,'main-title')]");
    if (titleNode == null) return;

    // The heading text becomes a file name; replace characters the file system rejects.
    string fileName = string.Join("_", titleNode.InnerText.Split(Path.GetInvalidFileNameChars()));

    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(imgurl);
        request.Referer = referer;
        request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36";
        request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9";
        request.ContentType = "image/jpeg";

        // Dispose response, stream and image even when decoding or saving fails.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (Image img = Image.FromStream(stream))
        {
            img.Save(path + "\\" + fileName + ".jpg");
        }
    }
    catch (WebException)
    {
        // The request itself failed; the original attributes this to throttling.
        MessageBox.Show("软件请求过于频繁被服务器拒绝,请停止爬取后调高爬取间隔。");
        return;
    }
    catch (Exception ex)
    {
        // Non-network failures (bad image data, disk errors) get their real cause reported.
        MessageBox.Show(ex.ToString());
        return;
    }

    HtmlNode naviDiv = doc.DocumentNode.SelectSingleNode("//div[starts-with(@class,'pagenavi')]");
    if (naviDiv == null) return;
    HtmlNodeCollection pagenavi = naviDiv.SelectNodes("span");
    if (pagenavi == null) return;

    // The attribute-less <span> holds a number that is taken as the current page.
    int page = 0;
    foreach (var item in pagenavi)
    {
        int parsed;
        if (item.Attributes.Count == 0 && int.TryParse(item.InnerText, out parsed))
        {
            page = parsed;
        }
    }
    page++;
    string new_url = f_url + "/" + page.ToString();

    // Interval is entered in seconds; treat unparsable, NaN, infinite or
    // negative input as "no delay" rather than throwing.
    float seconds;
    if (!float.TryParse(Interval.Text, out seconds) || float.IsNaN(seconds) || float.IsInfinity(seconds) || seconds < 0)
    {
        seconds = 0;
    }
    System.Threading.Thread.Sleep((int)(seconds * 1000));

    // Pump pending UI messages so a click on the stop button is processed
    // before the next page is requested.
    Application.DoEvents();

    // Continue only when the current page actually links to the next one;
    // the current page becomes the referer for that request.
    if (doc.DocumentNode.SelectSingleNode("//a[starts-with(@href,'" + new_url + "')]") != null)
    {
        DownloadImg(path, new_url, f_url, url);
    }
}