Java HTML解析器

henry307 发表于 2023-3-23 12:54

本帖最后由 henry307 于 2023-3-23 12:58 编辑

这是一个快速、实时的HTML解析器，可以通过DOM或CSS选择器来查找、提取数据。下面的例子展示此解析器的用法，例子还用到了上文提到的Java爬虫引擎。

package cfw.test;

import cfw.html.TagSearchRange;
import cfw.html.HtmlParser;
import cfw.html.HtmlTag;
import cfw.http.ResponseResult;
import cfw.http.UserAgentPack;
import cfw.http.WebClient;
import cfw.http.WebRequest;
import cfw.model.FinancialNewsListModel;
import com.alibaba.fastjson.JSONArray;
import com.sun.jndi.toolkit.url.Uri;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class HtmlParserTest {

/**
 * Entry point of the demo: runs both crawlers in turn and, for each one, prints
 * the number of collected entries followed by the entries serialized as JSON.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {

   // Sina Finance headline list: crawl, report the count, dump as JSON.
   List<FinancialNewsListModel> sinaArticles = crawlSinaFinancialNewsList();
   String sinaSummary = String.format("【新浪财经-要闻列表页】抓取到%s个文章列表", sinaArticles.size());
   System.out.println(sinaSummary);
   System.out.println(JSONArray.toJSON(sinaArticles).toString());

   // iFeng Finance article list: same reporting steps.
   List<FinancialNewsListModel> ifengArticles = crawlIFengFinancialNewsList();
   String ifengSummary = String.format("【凤凰财经-文章列表页】抓取到%s个文章列表", ifengArticles.size());
   System.out.println(ifengSummary);
   System.out.println(JSONArray.toJSON(ifengArticles).toString());

}

/**
 * Crawls the Sina Finance home page (http://finance.sina.com.cn/) and collects
 * the links found inside the element with id {@code fin_tabs0_c0} (the
 * "headlines" tab).
 *
 * <p>For every link, the {@code href} attribute becomes the article URL and the
 * tag value becomes the title. The article id is {@code "sina-"} followed by the
 * last path segment of the URL with {@code .html} / {@code .shtml} removed.
 * Links without an {@code href}, or whose URL path has no segment, are skipped.
 *
 * <p>Any exception raised while downloading or parsing is caught and reported on
 * standard output; the entries collected up to that point are still returned.
 *
 * @return the collected entries; never {@code null}, possibly empty
 */
private static List<FinancialNewsListModel> crawlSinaFinancialNewsList() {
   // Source page: http://finance.sina.com.cn/

   List<FinancialNewsListModel> fnews = new ArrayList<>();
   try {
         String url = "http://finance.sina.com.cn/";
         WebRequest req = new WebRequest();
         req.setUrl(url);
         req.setMethod("GET");
         req.setUserAgent(UserAgentPack.getUserAgentRandom());
         Map<String, String> dic = new HashMap<>();
         dic.put("Upgrade-Insecure-Requests", "1");
         dic.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
         dic.put("Accept-Encoding", "gzip, deflate");
         dic.put("Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6");
         req.setSpecialHeadCollection(dic);
         ResponseResult sr = WebClient.download(req);
         String html = sr.getResponseHtmlStr();
         HtmlParser parser = new HtmlParser(html);
         HtmlTag htmlTag = parser.parse();

         // Headlines tab
         HtmlTag finTag = htmlTag.getElementById("fin_tabs0_c0");
         List<HtmlTag> aTags = finTag.getElementsByTagName("a");
         for (HtmlTag aTag : aTags) {
            String href = aTag.getAttribute("href");
            // An anchor without a usable href cannot be parsed as a URL; skip it
            // instead of letting one bad link abort the whole loop.
            if (href == null || href.isEmpty()) {
               continue;
            }
            FinancialNewsListModel list = new FinancialNewsListModel();
            list.setArticalUrl(href);
            list.setArticalTitle(aTag.getValue());
            Uri uri = new Uri(list.getArticalUrl());
            String[] strs = uri.getPath().split("/");
            if (strs.length > 1) {
               // split() yields an array, which has no replace(); the id is taken
               // from the last path segment, the one that carries the file suffix.
               String fileName = strs[strs.length - 1];
               list.setArticalMD("sina-" + fileName.replace(".html", "").replace(".shtml", ""));
               fnews.add(list);
            }
         }
   } catch (Exception ex) {
         System.out.println("【新浪财经-要闻列表页】抓取失败" + ex.getMessage());
   }

   return fnews;
}

/**
 * Crawls the iFeng Finance home page (http://finance.ifeng.com/) and collects
 * one entry per article row.
 *
 * <p>The first tag whose {@code class} is {@code list-tab} is located; inside it,
 * every tag whose {@code class} is {@code list_L z20 clearfix} is treated as a
 * row. For each row, the first {@code a} tag inside the first {@code list_text}
 * tag supplies the URL ({@code href}) and the title (tag value). The article id
 * is {@code "ifeng-"} followed by the last path segment of the URL with
 * {@code .html} / {@code .shtml} removed. Rows whose link has no {@code href},
 * or whose URL path has no segment, are skipped.
 *
 * <p>Any exception raised while downloading or parsing (including a missing
 * {@code list-tab} / {@code list_text} / {@code a} tag) is caught and reported
 * on standard output; the entries collected up to that point are still returned.
 *
 * @return the collected entries; never {@code null}, possibly empty
 */
private static List<FinancialNewsListModel> crawlIFengFinancialNewsList() {
   // Source page: http://finance.ifeng.com/

   List<FinancialNewsListModel> fnews = new ArrayList<>();
   try {
         String url = "http://finance.ifeng.com/";
         WebRequest req = new WebRequest();
         req.setUrl(url);
         req.setMethod("GET");
         req.setUserAgent(UserAgentPack.getUserAgentRandom());
         Map<String, String> dic = new HashMap<>();
         dic.put("Upgrade-Insecure-Requests", "1");
         dic.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
         dic.put("Accept-Encoding", "gzip, deflate");
         dic.put("Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6");
         req.setSpecialHeadCollection(dic);
         ResponseResult sr = WebClient.download(req);
         String html = sr.getResponseHtmlStr();
         HtmlParser parser = new HtmlParser(html);

         HtmlTag htmlTag = parser.parse();

         // Constant-first equals(): a tag without a class attribute must simply
         // not match rather than raise a NullPointerException inside the predicate.
         HtmlTag listTag = htmlTag.search(t -> "list-tab".equals(t.getAttribute("class")), TagSearchRange.AllElements).get(0);
         List<HtmlTag> divTags = listTag.search(s -> "list_L z20 clearfix".equals(s.getAttribute("class")), TagSearchRange.AllElements);
         for (HtmlTag divTag : divTags) {
            HtmlTag listTextTag = divTag.search(d -> "list_text".equals(d.getAttribute("class")), TagSearchRange.AllElements).get(0);
            HtmlTag aTag = listTextTag.getElementsByTagName("a").get(0);

            String href = aTag.getAttribute("href");
            // An anchor without a usable href cannot be parsed as a URL; skip it
            // instead of letting one bad row abort the whole loop.
            if (href == null || href.isEmpty()) {
               continue;
            }
            FinancialNewsListModel list = new FinancialNewsListModel();
            list.setArticalUrl(href);
            list.setArticalTitle(aTag.getValue());
            Uri uri = new Uri(list.getArticalUrl());
            String[] strs = uri.getPath().split("/");
            // Same guard as the Sina crawler: a path such as "/" splits into an
            // empty array and has no segment to derive an id from.
            if (strs.length > 1) {
               // split() yields an array, which has no replace(); the id is taken
               // from the last path segment, the one that carries the file suffix.
               String fileName = strs[strs.length - 1];
               list.setArticalMD("ifeng-" + fileName.replace(".html", "").replace(".shtml", ""));
               fnews.add(list);
            }
         }

   } catch (Exception ex) {
         System.out.println("【凤凰财经-文章列表页】抓取失败" + ex.getMessage());
   }

   return fnews;
}

}

运行结果如下：

最后源码下载。

pengze 发表于 2023-3-23 14:55

你这个看着有点像webmagic，一般来说用Jsoup这个jar包足够了，单纯讨论下，没别的意思:lol

Island96 发表于 2023-7-5 10:01

henry307 发表于 2023-7-5 09:50
好的，晚点我看下

import cfw.html.TagSearchRange;
import cfw.html.HtmlParser;
import cfw.html.HtmlTag;
import cfw.http.UserAgentPack;
import cfw.model.FinancialNewsListModel;
您链接的那一版爬虫引擎少了这五个类

syh315 发表于 2023-3-23 16:53

学习了，谢谢分享

henry307 发表于 2023-3-23 19:07

pengze 发表于 2023-3-23 14:55
你这个看着有点像webmagic，一般来说用Jsoup这个jar包足够了，单纯讨论下，没别的意思
习惯了document，所以实现了方便用

huaxincanmeng 发表于 2023-3-30 15:43

jsoup也可以

henry307 发表于 2023-3-30 21:34

huaxincanmeng 发表于 2023-3-30 15:43
jsoup也可以

这个性能更高

jianghuai 发表于 2023-3-31 10:42

Island96 发表于 2023-7-5 09:39

大佬,你的源码下载好像没有链接哎,前一版大爬虫引擎少了一些东西

henry307 发表于 2023-7-5 09:50

Island96 发表于 2023-7-5 09:39
大佬,你的源码下载好像没有链接哎,前一版大爬虫引擎少了一些东西

好的，晚点我看下

页: [1] 2

吾爱破解 - 52pojie.cn's Archiver

Java HTML解析器