import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import cn.edu.hfut.dmic.webcollector.util.ExceptionUtils;
import cn.edu.hfut.dmic.webcollector.util.FileUtils;
import cn.edu.hfut.dmic.webcollector.util.MD5Utils;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.util.CollectionUtils;
import java.io.File;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Crawls image threads from www.52guzhuang.com
*
* @author hu
*/
public class GuZhuangCrawler extends BreadthCrawler {
/**
* @param crawlPath crawlPath is the path of the directory which maintains
* information of this crawler
* @param autoParse if autoParse is true, BreadthCrawler will auto extract
* links which match regex rules from the page
*/
File baseDir = new File("E:\\home");
public GuZhuangCrawler(String crawlPath, boolean autoParse) {
super(crawlPath, autoParse);
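// 50 worker threads, 5000 ms between requests per thread, no connect/read timeout (0 means wait indefinitely)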
setThreads(50);
getConf().setExecuteInterval(5000);
getConf().setConnectTimeout(0);
getConf().setReadTimeout(0);
getConf().setTopN(100);
//getConf().getDefaultUserAgent();
//setResumable(true);
}
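/**
 * Pages are dispatched by the type they were queued with:
 * "list" -> a forum list page, from which thread links are extracted,
 * "img" -> a thread detail page, from which attachment images and extra pages are queued,
 * "downloadImg" -> the image file itself, which is written to disk.
 */
@Override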
public void visit(Page page, CrawlDatums next) {
String url = page.url();
/*if the page is a forum list page*/
if (page.matchType("list")) {
/*extract each thread's title and link by css selector*/
Elements eList = page.select("div#threadlist > div.sk-box > div.bd > div.sk-vlist > div");
for (Element element : eList) {
Element link = element.select("div.v-link > a").first();
if(link!=null){
String title = link.attr("title");
String href = link.attr("href");
System.out.println("URL:" + url);
System.out.println("title:" + title);
System.out.println("href:" + href);
next.addAndReturn(href).type("img").meta("title", title);
}
}
}
if (page.matchType("img")) {
String title = page.meta("title");
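// keep only the Chinese (CJK) characters of the title so it can be used safely as a directory name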
String regex = "[\u4e00-\u9fa5]+";
Matcher m = Pattern.compile(regex).matcher(title);
StringBuffer str = new StringBuffer();
while (m.find()){
str.append(m.group());
}
title = str.toString();
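// each child of div#postlist is one post ("floor") of the Discuz thread; the last child is not a post, so the loop stops at size()-1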
Elements eList = page.select("div#postlist > div");
for(int i=0;i<eList.size()-1;i++){
Elements links = null;
if(i==0){
if(CollectionUtils.isEmpty(links)){
links = eList.get(i).select("div#jiathis_share_CODE_HTML4 > div.t_fsz >table > tbody > tr:nth-child(1) >td.t_f >ignore_js_op");
}
if(CollectionUtils.isEmpty(links)){
links = eList.get(i).select("#jiathis_share_CODE_HTML4 > div.t_fsz >table > tr:nth-child(1) >td.t_f >div:nth-last-of-type(1) > font > font > font > ignore_js_op");
}
if(CollectionUtils.isEmpty(links)){
links = eList.get(i).select("#jiathis_share_CODE_HTML4 > div.t_fsz >table > tr:nth-child(1) >td.t_f > ignore_js_op");
}
if(CollectionUtils.isEmpty(links)){
links = eList.get(i).select("table.plhin > tbody > tr:nth-child(1) > td.plc > div.pct > div.pcb > div.t_fsz > table > tbody > tr:nth-child(1) > td.t_f > div > ignore_js_op");
}
}else {
if(CollectionUtils.isEmpty(links)){
links = eList.get(i).select("table.plhin > tbody > tr:nth-child(1) > td.plc > div.pct > div.pcb > div.t_fsz > table > tbody > tr:nth-child(1) >td.t_f > ignore_js_op");
}
if(CollectionUtils.isEmpty(links)){
links = eList.get(i).select("table.plhin > tbody > tr:nth-child(1) > td.plc > div.pct > div.pcb > div.t_fsz > table > tbody > tr:nth-child(1) > td.t_f > div > ignore_js_op");
}
}
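// on Discuz attachment images, "zoomfile" usually holds the full-size image path and "aid" the attachment id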
for(Element e:links){
Element link = e.select("img").first();
if(link==null){
continue;
}
String src = link.attr("zoomfile");
String aid = link.attr("aid");
System.out.println("title:\n" + title);
System.out.println("src:\n" + src);
next.addAndReturn("http://www.52guzhuang.com/"+src).type("downloadImg").meta("title", title).meta("name",aid);
}
//#jiathis_share_CODE_HTML4
//Element link = eList.get(i).select("table.plhin > tbody > tr:nth-child(1) > td.plc > div.pct > div.pcb > div.t_fsz > table > tbody > tr:nth-child(1) >td.t_f > ignore_js_op:nth-child(1) >img").first();
}
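// the pager's last page-number link tells how many pages the thread has; pages 2..N are rebuilt from the thread URL and queued as "img"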
Element ele = page.select("div#ct > div.pgs.mtm.mbm.cl > div.pg > a:nth-last-child(3)").first();
//Element ele1 = page.select("div#ct > div.pgs.mtm.mbm.cl > div.pg > a").get(-2);
if(null!=ele){
String pageNum = ele.text();
if(null!=pageNum&& Integer.valueOf(pageNum)>1){
//strip the trailing "<pageNo>-1.html" so threads whose current page number has more than one digit still work
String subStr = url.substring(0,url.lastIndexOf("-",url.lastIndexOf("-")-1)+1);
for (int i = 2; i<=Integer.valueOf(pageNum); i++) {
next.addAndReturn(subStr+i+"-1.html").type("img").meta("title", title);
}
}
}
}
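// "downloadImg" requests: write the raw image bytes to a folder named after the cleaned thread title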
if (page.matchType("downloadImg")){
String title = page.meta("title");
//use the Content-Type header to tell whether this resource is a web page or an image
String contentType = page.contentType();
//only handle responses whose Content-Type marks them as images
if(contentType!=null && contentType.startsWith("image")){
//take the image file extension from the Content-Type
String extensionName=contentType.split("/")[1];
try {
byte[] image = page.content();
//name the file after the MD5 of the image bytes
String fileName = String.format("%s.%s", MD5Utils.md5(image), extensionName);
File imageFile = new File(baseDir.getPath()+"\\"+title,fileName);
FileUtils.write(imageFile,image);
System.out.println("保存图片 "+page.url()+" 到 "+ imageFile.getAbsolutePath());
} catch (Exception e) {
ExceptionUtils.fail(e);
}
}
}
}
public static void main(String[] args) throws Exception {
GuZhuangCrawler crawler = new GuZhuangCrawler("crawl", true);
/*interval between visits of each thread can be set here, in milliseconds*/
//crawler.setVisitInterval(1000);
/*interval between HTTP request retries can be set here, in milliseconds*/
//crawler.setRetryInterval(1000);
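// only the first list page (forum-59-1.html) is seeded here; raise the upper bound to crawl more list pages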
for (int i = 1; i <= 1; i++) {
String seedUrl = "http://www.52guzhuang.com/forum-59-"+i+".html";
crawler.addSeedAndReturn(seedUrl).meta("pageNum", i).type("list");
}
crawler.start(6);
}
}
Hanfu girls
How to use:
1. Add the WebCollector dependency to pom.xml:
<dependencies>
    <dependency>
        <groupId>cn.edu.hfut.dmic.webcollector</groupId>
        <artifactId>WebCollector</artifactId>
        <version>2.71</version>
    </dependency>
</dependencies>
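Note: the code also imports org.jsoup and org.springframework.util.CollectionUtils. jsoup is pulled in transitively by WebCollector, but Spring usually is not, so if your project does not already use Spring you may need one more dependency (the version below is only an example), or simply replace CollectionUtils.isEmpty(links) with links == null || links.isEmpty():

    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-core</artifactId>
        <version>5.1.9.RELEASE</version>
    </dependency>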
2. For more usage examples and documentation, see https://github.com/CrawlScript/WebCollector