本人喜爱看小说,于是便写了个采集代码
采集队列,保存采集到的章节路径
package com.test3;
import java.util.LinkedList;
import java.util.Queue;
/**
 * Static FIFO queue that holds the chapter paths waiting to be crawled.
 *
 * <p>All state lives in one static {@link LinkedList}. No synchronization is
 * performed by this class.
 */
public class UrlQueue {
    /** Backing store, typed as {@code Queue<Object>} rather than a raw type. */
    private static final Queue<Object> queue = new LinkedList<Object>();

    /**
     * Appends an element to the tail of the queue.
     *
     * @param o the element to enqueue
     */
    public static void addQueue(Object o) {
        queue.add(o);
    }

    /**
     * Removes and returns the head of the queue.
     *
     * @return the oldest element, or {@code null} when the queue is empty
     */
    public static Object polQueue() {
        return queue.poll();
    }

    /**
     * Reports whether the queue currently holds no elements.
     *
     * @return {@code true} if the queue is empty
     */
    public static boolean isEmpty() {
        return queue.isEmpty();
    }

    /**
     * Reports whether an equal element is already queued.
     *
     * @param o the element to look for
     * @return {@code true} if the queue contains {@code o}
     */
    public static boolean contains(Object o) {
        return queue.contains(o);
    }
}
文件下载类,将采集到的文本信息写入文件中
package com.test3;
import java.io.FileWriter;
import java.io.IOException;
/**
 * Writes crawled text to a local text file.
 */
public class DownlodFile {
    /**
     * Appends {@code content} to the file {@code name + ".txt"}, creating the
     * file if it does not exist yet.
     *
     * <p>The file is opened in append mode, so repeated calls with the same
     * {@code name} accumulate text in one file. {@link FileWriter} encodes with
     * the platform default charset. I/O failures are printed to standard error
     * and not propagated to the caller.
     *
     * @param name    file path without the {@code .txt} extension
     * @param content text to append
     */
    public static void write(String name, String content) {
        FileWriter fw = null;
        try {
            fw = new FileWriter(name + ".txt", true);
            fw.write(content);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close in finally so the file handle is released even when write() fails.
            if (fw != null) {
                try {
                    fw.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
创建网页请求工具类
package com.test3;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.net.UnknownHostException;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLException;
import org.apache.http.HttpEntity;
import org.apache.http.HttpEntityEnclosingRequest;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLContextBuilder;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.client.LaxRedirectStrategy;
import org.apache.http.protocol.HttpContext;
/**
 * Helper for issuing HTTP GET requests with Apache HttpClient.
 */
public class HttpClientTool {

    /** User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36";

    /**
     * Holder for the single client reused by {@link #getHttpEntity(String)}.
     * The client is created on first use, so a new connection pool is no longer
     * built (and never closed) for every request.
     */
    private static final class SharedClient {
        static final CloseableHttpClient INSTANCE = createSSLClientDefault();
    }

    /**
     * Builds a new client that uses a trust-all SSL context and a custom retry handler.
     *
     * <p>SECURITY NOTE: the trust strategy accepts every certificate chain, so the
     * server's certificate is not validated. Do not use this client for anything
     * sensitive.
     *
     * @return the configured client, or a default client if the SSL context
     *         cannot be built
     */
    public static CloseableHttpClient createSSLClientDefault() {
        try {
            // SSL context that trusts every certificate chain.
            SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial(null, new TrustStrategy() {
                @Override
                public boolean isTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                    return true;
                }
            }).build();
            SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslContext);

            // Retry policy: stop once five executions were attempted, never retry
            // the listed exception types, otherwise retry only requests that carry no body.
            HttpRequestRetryHandler retryHandler = new HttpRequestRetryHandler() {
                @Override
                public boolean retryRequest(IOException exception, int executionCount, HttpContext context) {
                    if (executionCount >= 5) {
                        // Do not retry if over max retry count
                        return false;
                    }
                    if (exception instanceof InterruptedIOException) {
                        // Timeout
                        return false;
                    }
                    if (exception instanceof UnknownHostException) {
                        // Unknown host
                        return false;
                    }
                    if (exception instanceof ConnectTimeoutException) {
                        // Connection timed out
                        return false;
                    }
                    if (exception instanceof SSLException) {
                        // SSL handshake exception
                        return false;
                    }
                    HttpClientContext clientContext = HttpClientContext.adapt(context);
                    HttpRequest request = clientContext.getRequest();
                    // A request without an entity body is treated as idempotent and may be retried.
                    return !(request instanceof HttpEntityEnclosingRequest);
                }
            };

            // Both the connection-request timeout (waiting for a connection from the
            // connection manager) and the connect timeout are 20 seconds; circular
            // redirects are rejected.
            // NOTE(review): no socket (read) timeout is configured here - confirm
            // whether one was intended.
            RequestConfig requestConfig = RequestConfig.custom()
                    .setConnectionRequestTimeout(20000)
                    .setConnectTimeout(20000)
                    .setCircularRedirectsAllowed(false)
                    .build();

            LaxRedirectStrategy redirectStrategy = new LaxRedirectStrategy();

            return HttpClients.custom()
                    .setSSLSocketFactory(sslsf)
                    .setUserAgent(USER_AGENT)
                    // The original called setMaxConnPerRoute twice (25, then 256), so the
                    // first value was overwritten; 256 is applied as the pool-wide total.
                    .setMaxConnPerRoute(25)
                    .setMaxConnTotal(256)
                    .setRetryHandler(retryHandler)
                    .setRedirectStrategy(redirectStrategy)
                    .setDefaultRequestConfig(requestConfig)
                    .build();
        } catch (KeyManagementException e) {
            e.printStackTrace();
        } catch (NoSuchAlgorithmException e) {
            e.printStackTrace();
        } catch (KeyStoreException e) {
            e.printStackTrace();
        }
        // Fall back to a default client when the SSL context could not be built.
        return HttpClients.createDefault();
    }

    /**
     * Sends a GET request to {@code url} and returns the response entity.
     *
     * <p>The caller must fully consume the returned entity so that the underlying
     * connection goes back to the shared pool.
     *
     * @param url the address to request
     * @return the response entity when the status is 200, otherwise {@code null}
     *         (also {@code null} when an I/O error occurs; the error is printed)
     */
    public static HttpEntity getHttpEntity(String url) {
        HttpGet get = new HttpGet(url);
        HttpEntity entity = null;
        try {
            HttpResponse response = SharedClient.INSTANCE.execute(get);
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode != HttpStatus.SC_OK) {
                System.out.println("Method failed:" + response.getStatusLine());
                // The body is not handed to the caller, so release the connection here.
                get.abort();
            } else {
                entity = response.getEntity();
            }
        } catch (IOException e) {
            // Also covers ClientProtocolException, which is an IOException.
            e.printStackTrace();
        }
        return entity;
    }
}
小说正则工具类
package com.test3;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class HtmlRegex {
//章节内容正则
public static String content="<div id=\"content\">(.*?)</div>";
//分章节正则
public static String part="<dd><a href=\"(.*?)\">(.*?)</a></dd>";
//获取小说名正则
public static String name="<div id=\"info\"><h1>(.*?)</h1>.*</div>";
/**
* 通过传输的网页源代码获取章节内容
* @param cont
* @return
*/
public static String getContent(String cont){
Pattern p=Pattern.compile(content);
Matcher ma=p.matcher(cont);
String str=null;
while(ma.find()){
str=ma.group(1);
}
str=str.replace(" ", "");
str=str.replace("<br />", "\r\n");
str=str.replace("www.lingdiankanshu.com", "");
return str;
}
/**
* 通过网页源代码获取小说姓名
* @param cont
* @return
*/
public static String getName(String cont){
//System.out.println(cont);
Pattern p=Pattern.compile(name);
Matcher ma=p.matcher(cont);
String str=null;
while(ma.find()){
str=ma.group(1);
}
System.out.println(str);
return str;
}
/**
* 获取章节目录传入list集中
* @param cont
* @return
*/
public static List<String> getPart(String cont){
List<String> list=new ArrayList<String>();
Pattern pattern=Pattern.compile(part);
Matcher matcher=pattern.matcher(cont);
while(matcher.find()){
list.add(matcher.group(1));
//UrlQueue.addQueue(matcher.group(1));
}
return list;
}
//测试代码
public static void main(String[] args) {
String name=getName("<div id=\"maininfo\"><div id=\"info\"><h1>斗破苍穹</h1><p>作 者:天蚕土豆</p><p>动 作:<a href=\"javascript:;\" onclick=\"showpop('/modules/article/addbookcase.php?bid=3133&ajax_request=1');\">加入书架</a>, <a href=\"javascript:;\" onclick=\"showpop('/modules/article/uservote.php?id=3133&ajax_request=1\');\">投推荐票</a>, <a href=\"#footer\">直达底部</a></p></div>");
System.out.println(name);
}
}
主程序
package com.test3;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.List;
import org.apache.http.HttpEntity;
import org.apache.http.ParseException;
import org.apache.http.util.EntityUtils;
/**
 * Entry point of the crawler: downloads every chapter listed on a novel's
 * index page and appends the text to one file named after the novel.
 */
public class Text {
    /**
     * Crawls the novel whose chapter index is at {@code url}.
     *
     * <p>The index page is decoded as GBK, the chapter hrefs are queued in
     * {@link UrlQueue}, and each chapter is fetched from {@code url + href} and
     * appended through {@link DownlodFile}. A chapter whose request yields no
     * entity is skipped and counted as a failure. "SUCCESS" is printed only when
     * every chapter was fetched and no exception occurred.
     *
     * @param url address of the chapter index page; chapter hrefs are appended to
     *            it verbatim
     */
    public static void getText(String url){
        // Fetch the index page; a null entity means the request failed.
        HttpEntity entity= HttpClientTool.getHttpEntity(url);
        if (entity == null) {
            System.out.println("FAILED: could not fetch chapter index " + url);
            return;
        }
        int failed = 0;
        boolean completed = false;
        try {
            // Decode the page as GBK to obtain the HTML source.
            String partHtml=EntityUtils.toString(entity, "GBK");
            String name=HtmlRegex.getName(partHtml);
            // Extract the chapter hrefs and put them on the crawl queue.
            List<String> urlList=HtmlRegex.getPart(partHtml);
            for(String st:urlList){
                UrlQueue.addQueue(st);
            }
            int coun=0;
            int len=urlList.size();
            System.out.println("共有"+len+"章");
            NumberFormat numberFormat=NumberFormat.getInstance();
            numberFormat.setMaximumFractionDigits(2);
            // Drain the queue, fetching one chapter per iteration.
            while(!UrlQueue.isEmpty()){
                coun++;
                String re=numberFormat.format((double)coun/(double)len*100);
                System.out.println("当前进度:"+re+"%");
                String uri=url+(String)UrlQueue.polQueue();
                HttpEntity he=HttpClientTool.getHttpEntity(uri);
                if (he == null) {
                    // Passing null to EntityUtils.toString would throw; skip instead.
                    failed++;
                    System.out.println("Skipped chapter (request failed): " + uri);
                    continue;
                }
                String content=EntityUtils.toString(he, "GBK");
                // Append the extracted chapter text to the novel's file.
                DownlodFile.write(name,HtmlRegex.getContent(content));
            }
            completed = true;
        } catch (ParseException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (completed && failed == 0) {
            System.out.println("SUCCESS");
        } else {
            // Drop leftovers so a later call does not crawl stale paths from this run.
            while (!UrlQueue.isEmpty()) {
                UrlQueue.polQueue();
            }
            System.out.println("FAILED: " + (completed ? failed + " chapter(s) skipped" : "aborted by an error"));
        }
    }

    public static void main(String[] args) {
        getText("http://www.lingdiankanshu.com/html/3/3133/");
    }
}
将需要采集的小说目录页地址填入 Text 类 main 方法中的 getText 调用,然后运行 main 方法即可。
效果图
|