webcollector框架爬虫古装汉服图片

qq58452077 发表于 2019-8-12 14:51

本帖最后由 qq58452077 于 2019-8-12 14:57 编辑

```java

import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import cn.edu.hfut.dmic.webcollector.util.ExceptionUtils;
import cn.edu.hfut.dmic.webcollector.util.FileUtils;
import cn.edu.hfut.dmic.webcollector.util.MD5Utils;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.util.CollectionUtils;

import java.io.File;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* Crawling news from hfut news
*
* @AuThor hu
*/
public class GuZhuangCrawler extends BreadthCrawler {
/**
* @Param crawlPath crawlPath is the path of the directory which maintains
*                information of this crawler
* @param autoParse if autoParse is true,BreadthCrawler will auto extract
*                links which match regex rules from pag
*/
File baseDir = new File("E:\\home");

public GuZhuangCrawler(String crawlPath, boolean autoParse) {
   super(crawlPath, autoParse);
   setThreads(50);
   getConf().setExecuteInterval(5000);
   getConf().setConnectTimeout(0);
   getConf().setReadTimeout(0);
   getConf().setTopN(100);
   //getConf().getDefaultUserAgent();
   //setResumable(true);
}

public void visit(Page page, CrawlDatums next) {
   String url = page.url();
   /*if page is news page*/
   if (page.matchType("list")) {
         /*extract title and content of news by css selector*/
         Elements eList = page.select("div#threadlist > div.sk-box > div.bd > div.sk-vlist > div");
         for (Element element : eList) {
            Element link = element.select("div.v-link > a").first();
            if(link!=null){
               String title = link.attr("title");
               String href = link.attr("href");
               System.out.println("URL:" + url);
               System.out.println("title:" + title);
               System.out.println("href:" + href);
               next.addAndReturn(href).type("img").meta("title", title);
            }
         }
   }
   if (page.matchType("img")) {
         String title= page.meta("title");
         String regex = "[\u4e00-\u9fa5]+";
         Matcher m = Pattern.compile(regex).matcher(title);
         StringBuffer str = new StringBuffer();
         while (m.find()){
            str.append(m.group());
         }
         title = str.toString();
         Elements eList = page.select("div#postlist > div");
         for(int i=0;i<eList.size()-1;i++){
            Elements links = null;
            if(i==0){
               if(CollectionUtils.isEmpty(links)){
                     links = eList.get(i).select("div#jiathis_share_CODE_HTML4 > div.t_fsz >table > tbody > tr:nth-child(1) >td.t_f >ignore_js_op");
               }
               if(CollectionUtils.isEmpty(links)){
                     links = eList.get(i).select("#jiathis_share_CODE_HTML4 > div.t_fsz >table > tr:nth-child(1) >td.t_f >div:nth-last-of-type(1) > font > font > font > ignore_js_op");
               }
               if(CollectionUtils.isEmpty(links)){
                     links = eList.get(i).select("#jiathis_share_CODE_HTML4 > div.t_fsz >table > tr:nth-child(1) >td.t_f > ignore_js_op");
               }
               if(CollectionUtils.isEmpty(links)){
                     links = eList.get(i).select("table.plhin > tbody > tr:nth-child(1) > td.plc > div.pct > div.pcb > div.t_fsz > table > tbody > tr:nth-child(1) > td.t_f > div > ignore_js_op");
               }

            }else {
               if(CollectionUtils.isEmpty(links)){
                     links = eList.get(i).select("table.plhin > tbody > tr:nth-child(1) > td.plc > div.pct > div.pcb > div.t_fsz > table > tbody > tr:nth-child(1) >td.t_f > ignore_js_op");
               }
               if(CollectionUtils.isEmpty(links)){
                     links = eList.get(i).select("table.plhin > tbody > tr:nth-child(1) > td.plc > div.pct > div.pcb > div.t_fsz > table > tbody > tr:nth-child(1) > td.t_f > div > ignore_js_op");
               }

            }
            for(Element e:links){
               Element link = e.select("img").first();
               String src = link.attr("zoomfile");
               String aid = link.attr("aid");
               System.out.println("title:\n" + title);
               System.out.println("content:\n" + src);
               next.addAndReturn("http://www.52guzhuang.com/"+src).type("downloadImg").meta("title", title).meta("name",aid);
            }
            //#jiathis_share_CODE_HTML4
            //Element link = eList.get(i).select("table.plhin > tbody > tr:nth-child(1) > td.plc > div.pct > div.pcb > div.t_fsz > table > tbody > tr:nth-child(1) >td.t_f > ignore_js_op:nth-child(1) >img").first();

         }
         Element ele = page.select("div#ct > div.pgs.mtm.mbm.cl > div.pg > a:nth-last-child(3)").first();
         //Element ele1 = page.select("div#ct > div.pgs.mtm.mbm.cl > div.pg > a").get(-2);
         if(null!=ele){
            String pageNum = ele.text();
            if(null!=pageNum&& Integer.valueOf(pageNum)>1){
               String subStr = url.substring(0,url.lastIndexOf("-") -1);
               for (int i = 2; i<=Integer.valueOf(pageNum); i++) {
                     next.addAndReturn(subStr+i+"-1.html").type("img").meta("title", title);
               }
            }
         }
   }
   if (page.matchType("downloadImg")){
         String title= page.meta("title");
         //根据http头中的Content-Type信息来判断当前资源是网页还是图片
         String contentType = page.contentType();
         //根据Content-Type判断是否为图片
         if(contentType!=null && contentType.startsWith("image")){
            //从Content-Type中获取图片扩展名
            String extensionName=contentType.split("/");
            try {
               byte[] image = page.content();
               //根据图片MD5生成文件名
               String fileName = String.format("%s.%s", MD5Utils.md5(image), extensionName);
               File imageFile = new File(baseDir.getPath()+"\\"+title,fileName);
               FileUtils.write(imageFile,image);
               System.out.println("保存图片 "+page.url()+" 到 "+ imageFile.getAbsolutePath());
            } catch (Exception e) {
               ExceptionUtils.fail(e);
            }
         }
   }
}

public static void main(String[] args) throws Exception {
   GuZhuangCrawler crawler = new GuZhuangCrawler("crawl", true);
   /*可以设置每个线程visit的间隔，这里是毫秒*/
   //crawler.setVisitInterval(1000);
   /*可以设置http请求重试的间隔，这里是毫秒*/
   //crawler.setRetryInterval(1000);

   for (int i = 1; i <= 1; i++) {
         String seedUrl = "http://www.52guzhuang.com/forum-59-"+i+".html";
         crawler.addSeedAndReturn(seedUrl).meta("pageNum", i).type("list");
   }
   crawler.start(6);
}
}

```

使用教程：

1. 在 pom.xml 文件中添加依赖：

```xml
<dependencies>
   <dependency>
         <groupId>cn.edu.hfut.dmic.webcollector</groupId>
         <artifactId>WebCollector</artifactId>
         <version>2.71</version>
   </dependency>
</dependencies>
```

2. WebCollector 框架的源码与文档见：https://github.com/CrawlScript/WebCollector

maplefly 发表于 2019-8-12 15:50

借鉴一下写个 C#版的

xincarry 发表于 2019-8-12 15:58

不搞这个，但也来支持下吧，毕竟这两年慢慢流行了

Bii 发表于 2019-8-21 14:51

感谢楼主分享

niebaohua 发表于 2019-8-22 11:32

图片好评，不错

页: [1]

吾爱破解 - 52pojie.cn's Archiver

webcollector框架爬虫古装汉服图片