import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import cn.edu.hfut.dmic.webcollector.util.ExceptionUtils;
import cn.edu.hfut.dmic.webcollector.util.FileUtils;
import cn.edu.hfut.dmic.webcollector.util.MD5Utils;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.util.CollectionUtils;
import java.io.File;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Crawls image threads from www.52guzhuang.com
*
* @author hu
*/
public class GuZhuangCrawler extends BreadthCrawler {
/**
* @param crawlPath crawlPath is the path of the directory which maintains
* information of this crawler
* @param autoParse if autoParse is true, BreadthCrawler will auto extract
* links which match regex rules from the page
*/
File baseDir = new File("E:\\home");
public GuZhuangCrawler(String crawlPath, boolean autoParse) {
super(crawlPath, autoParse);
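// 50 worker threads, 5000 ms between requests per thread, no connect/read timeout (0 means wait indefinitely)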
setThreads(50);
getConf().setExecuteInterval(5000);
getConf().setConnectTimeout(0);
getConf().setReadTimeout(0);
getConf().setTopN(100);
//getConf().getDefaultUserAgent();
//setResumable(true);
}
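/**
 * Pages are dispatched by the type they were queued with:
 * "list" -> a forum list page, from which thread links are extracted,
 * "img" -> a thread detail page, from which attachment images and extra pages are queued,
 * "downloadImg" -> the image file itself, which is written to disk.
 */
@Override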
public void visit(Page page, CrawlDatums next) {
String url = page.url();
/*if the page is a forum list page*/
if (page.matchType("list")) {
/*extract each thread's title and link by css selector*/
Elements eList = page.select("div#threadlist > div.sk-box > div.bd > div.sk-vlist > div");
for (Element element : eList) {
Element link = element.select("div.v-link > a").first();
if(link!=null){
String title = link.attr("title");
String href = link.attr("href");
System.out.println("URL:" + url);
System.out.println("title:" + title);
System.out.println("href:" + href);
next.addAndReturn(href).type("img").meta("title", title);
}
}
}
if (page.matchType("img")) {
String title = page.meta("title");
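// keep only the Chinese (CJK) characters of the title so it can be used safely as a directory name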
String regex = "[\u4e00-\u9fa5]+";
Matcher m = Pattern.compile(regex).matcher(title);
StringBuffer str = new StringBuffer();
while (m.find()){
str.append(m.group());
}
title = str.toString();
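// each child of div#postlist is one post ("floor") of the Discuz thread; the last child is not a post, so the loop stops at size()-1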
Elements eList = page.select("div#postlist > div");
for(int i=0;i<eList.size()-1;i++){
Elements links = null;
if(i==0){
if(CollectionUtils.isEmpty(links)){
links = eList.get(i).select("div#jiathis_share_CODE_HTML4 > div.t_fsz >table > tbody > tr:nth-child(1) >td.t_f >ignore_js_op");
}
if(CollectionUtils.isEmpty(links)){
links = eList.get(i).select("#jiathis_share_CODE_HTML4 > div.t_fsz >table > tr:nth-child(1) >td.t_f >div:nth-last-of-type(1) > font > font > font > ignore_js_op");
}
if(CollectionUtils.isEmpty(links)){
links = eList.get(i).select("#jiathis_share_CODE_HTML4 > div.t_fsz >table > tr:nth-child(1) >td.t_f > ignore_js_op");
}
if(CollectionUtils.isEmpty(links)){
links = eList.get(i).select("table.plhin > tbody > tr:nth-child(1) > td.plc > div.pct > div.pcb > div.t_fsz > table > tbody > tr:nth-child(1) > td.t_f > div > ignore_js_op");
}
}else {
if(CollectionUtils.isEmpty(links)){
links = eList.get(i).select("table.plhin > tbody > tr:nth-child(1) > td.plc > div.pct > div.pcb > div.t_fsz > table > tbody > tr:nth-child(1) >td.t_f > ignore_js_op");
}
if(CollectionUtils.isEmpty(links)){
links = eList.get(i).select("table.plhin > tbody > tr:nth-child(1) > td.plc > div.pct > div.pcb > div.t_fsz > table > tbody > tr:nth-child(1) > td.t_f > div > ignore_js_op");
}
}
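// on Discuz attachment images, "zoomfile" usually holds the full-size image path and "aid" the attachment id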
for(Element e:links){
Element link = e.select("img").first();
if(link==null){
continue;
}
String src = link.attr("zoomfile");
String aid = link.attr("aid");
System.out.println("title:\n" + title);
System.out.println("src:\n" + src);
next.addAndReturn("http://www.52guzhuang.com/"+src).type("downloadImg").meta("title", title).meta("name",aid);
}
//#jiathis_share_CODE_HTML4
//Element link = eList.get(i).select("table.plhin > tbody > tr:nth-child(1) > td.plc > div.pct > div.pcb > div.t_fsz > table > tbody > tr:nth-child(1) >td.t_f > ignore_js_op:nth-child(1) >img").first();
}
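// the pager's last page-number link tells how many pages the thread has; pages 2..N are rebuilt from the thread URL and queued as "img"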
Element ele = page.select("div#ct > div.pgs.mtm.mbm.cl > div.pg > a:nth-last-child(3)").first();
//Element ele1 = page.select("div#ct > div.pgs.mtm.mbm.cl > div.pg > a").get(-2);
if(null!=ele){
String pageNum = ele.text();
if(null!=pageNum&& Integer.valueOf(pageNum)>1){
//strip the trailing "<pageNo>-1.html" so threads whose current page number has more than one digit still work
String subStr = url.substring(0,url.lastIndexOf("-",url.lastIndexOf("-")-1)+1);
for (int i = 2; i<=Integer.valueOf(pageNum); i++) {
next.addAndReturn(subStr+i+"-1.html").type("img").meta("title", title);
}
}
}
}
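// "downloadImg" requests: write the raw image bytes to a folder named after the cleaned thread title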
if (page.matchType("downloadImg")){
String title = page.meta("title");
//use the Content-Type header to tell whether this resource is a web page or an image
String contentType = page.contentType();
//only handle responses whose Content-Type marks them as images
if(contentType!=null && contentType.startsWith("image")){
//take the image file extension from the Content-Type
String extensionName=contentType.split("/")[1];
try {
byte[] image = page.content();
//name the file after the MD5 of the image bytes
String fileName = String.format("%s.%s", MD5Utils.md5(image), extensionName);
File imageFile = new File(baseDir.getPath()+"\\"+title,fileName);
FileUtils.write(imageFile,image);
System.out.println("保存图片 "+page.url()+" 到 "+ imageFile.getAbsolutePath());
} catch (Exception e) {
ExceptionUtils.fail(e);
}
}
}
}
public static void main(String[] args) throws Exception {
GuZhuangCrawler crawler = new GuZhuangCrawler("crawl", true);
/*interval between visits of each thread can be set here, in milliseconds*/
//crawler.setVisitInterval(1000);
/*interval between HTTP request retries can be set here, in milliseconds*/
//crawler.setRetryInterval(1000);
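// only the first list page (forum-59-1.html) is seeded here; raise the upper bound to crawl more list pages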
for (int i = 1; i <= 1; i++) {
String seedUrl = "http://www.52guzhuang.com/forum-59-"+i+".html";
crawler.addSeedAndReturn(seedUrl).meta("pageNum", i).type("list");
}
crawler.start(6);
}
}
Hanfu girls
How to use:
1. Add the WebCollector dependency to pom.xml:
<dependencies>
    <dependency>
        <groupId>cn.edu.hfut.dmic.webcollector</groupId>
        <artifactId>WebCollector</artifactId>
        <version>2.71</version>
    </dependency>
</dependencies>
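Note: the code also imports org.jsoup and org.springframework.util.CollectionUtils. jsoup is pulled in transitively by WebCollector, but Spring usually is not, so if your project does not already use Spring you may need one more dependency (the version below is only an example), or simply replace CollectionUtils.isEmpty(links) with links == null || links.isEmpty():

    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-core</artifactId>
        <version>5.1.9.RELEASE</version>
    </dependency>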
2. For more usage examples and documentation, see https://github.com/CrawlScript/WebCollector