// [Java] View as plain text | Copy code
/**
 * Extracts text fragments from a fetched HTML page with a regular expression
 * and prints each non-empty fragment on its own line.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Download the page source through {@code SpiderUtils.getSource}. If that call
 *       throws, the stack trace is printed and processing continues with an empty
 *       string, so nothing is matched or printed.</li>
 *   <li>Build a pattern that matches: one of {@code p v r / "} followed by {@code >};
 *       then one character that is not {@code 字}, a word character, {@code <} or
 *       {@code *}; then at least three characters drawn from Unicode punctuation,
 *       word characters, numbers, letters, marks, full-width ASCII letters/digits,
 *       the CJK range and a few whitespace characters; then up to two characters that
 *       are not {@code 字}, a word character or {@code >}; and finally one of
 *       {@code <br}, {@code </p}, {@code </d}, {@code <p}.</li>
 *   <li>Strip the tag remnants, HTML entities and spaces from every match.</li>
 * </ol>
 */
@Test
void test12() {
    // Alternative character classes kept from the original for reference (unused):
    // String pinyin="āáǎàēéěèīíǐìōóǒA8B0òūúǔùǖǘǚǜüê";
    // String punctuation="[\\-,\\/,\\|,\\$,\\+,\\%,\\&,\\',\\(,\\),\\*,\\x20-\\x2f,\\x3a-\\x40,\\x5b-\\x60,\\x7b-\\x7e,\\x80-\\xff,\\u3000-\\u3002,\\u300a,\\u300b,\\u300e-\\u3011,\\u2014,\\u2018,\\u2019,\\u201c,\\u201d,\\u2026,\\u203b,\\u25ce,\\uff01-\\uff5e,\\uffe5]";
    // String eh_punctuation="\\u003A\\u0028\\u201C\\uFF0C\\uFF1F\\u3001\\u201D\\uFF01\\uFF1A\\u223C\\u003D\\u2026";

    // Tilde plus line feed, tab, no-break space, space and ideographic space.
    String tildeAndSpaces = "~\\u000A\\u0009\\u00A0\\u0020\\u3000";
    // Full-width a-z, A-Z and 0-9.
    String fullWidthAlnum = "\\uFF41-\\uFF5a\\uFF21-\\uFF3a\\uFF10-\\uFF19";
    // Range used to match Chinese characters.
    String cjkRange = "\\u4E00-\\u9FFF";

    String pageSource = "";
    try {
        pageSource = SpiderUtils.getSource("https://www.xbiquge6.com/9_9208/9120869.html");
        // Uncomment to dump the raw page: System.out.println(pageSource);
    } catch (Exception e) {
        // Best effort: report the failure and fall through with an empty page.
        e.printStackTrace();
    }

    String bodyClass = "[\\pP\\w\\pN\\pL\\pM" + fullWidthAlnum + cjkRange + tildeAndSpaces + "]";
    String regex = "[pvr/\"]>[^字\\w<*]" + bodyClass + "{3,}[^字\\w>]{0,2}(<br|</p|</d|<p)";
    Pattern fragmentPattern = Pattern.compile(regex);

    Matcher matcher = fragmentPattern.matcher(pageSource);
    while (matcher.find()) {
        // The clean-up steps are order-dependent; keep them in exactly this sequence.
        String text = matcher.group(0);
        text = text.replace("\r\n", "");
        text = text.replace("<br", "\n");
        text = text.replace("</p", "\n");
        text = text.replace("p>", "\n");
        text = text.replaceAll("&[a-z]{3,6};", "");   // drop HTML entities
        text = text.replace("\n", "");                // remove the temporary line breaks
        text = text.replace("<p", "\n");
        text = text.replace("/>", "");
        text = text.replace("r>", "");
        text = text.replace(" ", "");
        text = text.replace("</d", "");
        text = text.replace("v>", "");
        text = text.replace("\">", "");
        // NOTE(review): this looks identical to the space removal a few lines up; one of
        // the two may originally have targeted a non-ASCII space - confirm against the
        // original file.
        text = text.replace(" ", "");
        text = text.trim();

        if (text.isEmpty()) {
            continue;
        }
        System.out.println(text);
    }
}