好友
阅读权限25
听众
最后登录1970-1-1
|
三木猿
发表于 2020-9-9 15:29
本帖最后由 三木猿 于 2020-9-10 09:17 编辑
之前的版本有bug:爬取www.biquge.com需要先在网上下载安全证书,比较麻烦,所以更新了一版,新版本默认忽略安全证书
新版本可以选择下载数量,照顾小白不会改代码
新版本:https://wws.lanzouj.com/iAEMoghsgeb 密码:7vjz
jar包:https://wws.lanzouj.com/ilphyghsgcj 密码:f38a
至于好不好用,反正我是下不完的,我的云服务器一两天就下满了。
优先导包:
[XML] 纯文本查看 复制代码
<dependency>
<!-- jsoup HTML parser library @ https://jsoup.org/ -->
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.2</version>
</dependency>
这个是核心下载的代码
[Java] 纯文本查看 复制代码
package com.aaa.data;
import com.aaa.config.SSLHelper;
import com.aaa.dto.BookCatalogueDto;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* @author 三木猿
* @version 1.0
* @Title:
* @date 2020/8/10 15:16
*/
public class DownloadBook {
private static String dataSource;
private static Pattern pattern = Pattern.compile("<a\\s*href=\"?([\\w\\W]*?)\"?[\\s]*?[^>]>([\\s\\S]*?)(?=</a>)");
/**
 * Selects the site to crawl and downloads books from it with two worker threads.
 *
 * <p>Book ids {@code 0_1} up to, but not including, {@code 0_<count>} are split at
 * {@code count / 2}: the first worker takes the lower half, the second the upper half.
 * The call blocks until both workers have finished. If the calling thread is interrupted
 * while waiting, its interrupt flag is restored and the method returns without restarting
 * the workers.
 *
 * <p>An unrecognised {@code dataSource} is stored but nothing is downloaded.
 *
 * @param dataSource either {@code "biquge5200"} or {@code "biquge"}
 * @param count      exclusive upper bound of the numeric part of the book id
 */
public static void setDataSource(String dataSource, int count) {
    SSLHelper.init();
    DownloadBook.dataSource = dataSource;

    final String siteRoot;
    if ("biquge5200".equals(dataSource)) {
        siteRoot = "https://www.biquge5200.com/";
    } else if ("biquge".equals(dataSource)) {
        siteRoot = "https://www.biquge.com/";
    } else {
        return;
    }

    // Never let the split point fall below the first id, so small counts do not request "0_0".
    final int middle = Math.max(1, count / 2);
    Thread firstHalf = new Thread(() -> downloadRange(siteRoot, 1, middle));
    Thread secondHalf = new Thread(() -> downloadRange(siteRoot, middle, count));
    firstHalf.start();
    secondHalf.start();
    try {
        firstHalf.join();
        secondHalf.join();
    } catch (InterruptedException e) {
        // Spawning a second pair of workers here would download everything twice.
        Thread.currentThread().interrupt();
    }
}

/**
 * Downloads every book whose id lies in {@code [fromInclusive, toExclusive)} from one site.
 *
 * <p>A book whose target file already exists is skipped. A failure on one book is reported
 * on standard error and the loop moves on to the next id, so one missing page does not
 * abandon the rest of the range.
 */
private static void downloadRange(String siteRoot, int fromInclusive, int toExclusive) {
    for (int id = fromInclusive; id < toExclusive; id++) {
        String bookCod = "0_" + id;
        try {
            Document document = Jsoup.connect(siteRoot + bookCod + "/").get();
            Element info = document.getElementById("info");
            if (info == null) {
                System.err.println("No #info element for book " + bookCod + ", skipping");
                continue;
            }
            String bookName = info.select("h1").text();
            File target = new File("/usr/local/webapps/file/" + bookName + ".txt");
            if (target.exists()) {
                continue;
            }
            System.out.println("---------------" + bookName + "正在下载" + "--------------");
            List<BookCatalogueDto> bookCatalogue = getBookCatalogue(bookCod, document, pattern);
            downloadBook(bookCod, bookName, bookCatalogue);
            System.out.println("---------------" + bookName + "下载完成" + "--------------");
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return;
        } catch (Exception e) {
            System.err.println("Failed to download book " + bookCod + ": " + e);
        }
    }
}
/**
 * Downloads one book with three worker threads and merges the result.
 *
 * <p>Returns immediately when {@code /usr/local/webapps/file/<bookName>.txt} already exists.
 * Otherwise the chapter list is cut into three slices; worker {@code k} (1..3) writes its
 * slice to the part file named {@code bookName + k} using the downloader that matches the
 * current data source. After all workers have been joined the parts are combined and the
 * elapsed time in milliseconds is printed.
 *
 * @param bookCod          site id of the book, e.g. {@code "0_1"}
 * @param bookName         title of the book, used as the file name
 * @param bookCatalogueDto chapters of the book in reading order
 * @throws Exception if waiting for a worker is interrupted or combining the parts fails
 */
public static void downloadBook(String bookCod, String bookName, List<BookCatalogueDto> bookCatalogueDto) throws Exception {
    File finished = new File("/usr/local/webapps/file/" + bookName + ".txt");
    if (finished.exists()) {
        return;
    }

    final int workerCount = 3;
    Map<Integer, List<BookCatalogueDto>> slices = splitList(bookCatalogueDto, workerCount);
    long startedAt = System.currentTimeMillis();

    Thread[] workers = new Thread[workerCount];
    for (int slot = 0; slot < workerCount; slot++) {
        final int index = slot;
        final String partName = bookName + (slot + 1);
        workers[slot] = new Thread(() -> {
            try {
                if ("biquge5200".equals(dataSource)) {
                    biquge5200(bookCod, partName, slices.get(index));
                } else if ("biquge".equals(dataSource)) {
                    biquge(bookCod, partName, slices.get(index));
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        });
    }
    for (Thread worker : workers) {
        worker.start();
    }
    for (Thread worker : workers) {
        worker.join();
    }

    // Merge the part files into the final book file.
    combine(bookName);
    long finishedAt = System.currentTimeMillis();
    System.out.println("本次下载共用时" + (finishedAt - startedAt));
}
/**
 * Downloads the given chapters from www.biquge5200.com and appends them to the part file
 * {@code /usr/local/webapps/file/downloading/<bookName>.txt}.
 *
 * <p>If the part file already exists, the chapter titles already stored in it are read back
 * and the download resumes after the last stored title. Each chapter page is fetched once
 * and, on an I/O failure, once more after a five second pause; a chapter that fails twice
 * is skipped. If the thread is interrupted during that pause, the interrupt flag is restored
 * and the method returns. The writer is always closed before the method returns.
 *
 * @param bookCod          site id of the book, used in the chapter URL
 * @param bookName         name of the part file without extension
 * @param bookCatalogueDto chapters to download, in order
 * @throws Exception if the part file cannot be opened for appending
 */
public static void biquge5200(String bookCod, String bookName, List<BookCatalogueDto> bookCatalogueDto) throws
        Exception {
    String path = "/usr/local/webapps/file/downloading/" + bookName + ".txt";
    File file = new File(path);
    if (!file.exists()) {
        new File(file.getParent()).mkdirs();
        try {
            file.createNewFile();
        } catch (IOException e) {
            e.printStackTrace();
        }
    } else {
        // Resume: drop everything up to and including the last chapter already on disk.
        List<BookCatalogueDto> stored = txtCatalogue(bookName);
        if (!stored.isEmpty()) {
            String lastStored = stored.get(stored.size() - 1).getCatalogueName();
            for (int i = 0; i < bookCatalogueDto.size(); i++) {
                if (bookCatalogueDto.get(i).getCatalogueName().equals(lastStored)) {
                    bookCatalogueDto = bookCatalogueDto.subList(i + 1, bookCatalogueDto.size());
                    break;
                }
            }
        }
    }
    // Check before opening the writer so that nothing is left open on the early return.
    if (bookCatalogueDto.isEmpty()) {
        return;
    }

    // Opened outside the try so that a FileNotFoundException still reaches the caller.
    FileOutputStream out = new FileOutputStream(file, true);
    try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(out))) {
        for (BookCatalogueDto chapter : bookCatalogueDto) {
            String url = "https://www.biquge5200.com/" + bookCod + "/" + chapter.getCatalogueCod() + ".html";
            Document document;
            try {
                document = Jsoup.connect(url).get();
            } catch (IOException firstFailure) {
                try {
                    Thread.sleep(5000);
                } catch (InterruptedException interrupted) {
                    Thread.currentThread().interrupt();
                    return;
                }
                try {
                    document = Jsoup.connect(url).get();
                } catch (IOException secondFailure) {
                    continue;
                }
            }
            try {
                bw.write(document.select("h1").text());
                bw.newLine();
                Elements content = document.select("#content");
                // A page without #content contributes only its title.
                if (!content.isEmpty()) {
                    String html = content.get(0).html().replace("<div id='content'>", "").replace("</div>", "");
                    String cleaned = html.replace("<script>readx();</script>", "").replace("<script>chaptererror();</script>", "");
                    for (String paragraph : cleaned.replace("<p>", "").split("</p>")) {
                        bw.write(paragraph);
                        bw.newLine();
                    }
                }
                bw.flush();
            } catch (IOException writeFailure) {
                writeFailure.printStackTrace();
            }
        }
    } catch (IOException closeFailure) {
        closeFailure.printStackTrace();
    }
}
/**
 * Builds the chapter list of a book from its index page using three worker threads.
 *
 * <p>All {@code dd} elements of the page are cut into three slices, each slice is converted
 * by {@link #get} on its own thread, and the three partial lists are concatenated in slice
 * order. A slice whose worker does not deliver a result contributes an empty list.
 *
 * @param bookCod  site id of the book
 * @param document parsed index page of the book
 * @param pattern  regular expression handed to {@link #get} to extract the chapter link
 * @return the chapters in page order
 * @throws InterruptedException if the calling thread is interrupted while joining a worker
 */
public static List<BookCatalogueDto> getBookCatalogue(String bookCod, Document document, Pattern pattern) throws InterruptedException {
    final int workerCount = 3;
    Elements chapterEntries = document.getElementsByTag("dd");
    Map<Integer, List<Element>> slices = splitList(chapterEntries, workerCount);

    // One result slot per worker, pre-filled so a failed worker leaves an empty list behind.
    List<List<BookCatalogueDto>> partials = new ArrayList<>();
    List<Thread> workers = new ArrayList<>();
    for (int slot = 0; slot < workerCount; slot++) {
        partials.add(new ArrayList<BookCatalogueDto>());
        final int index = slot;
        workers.add(new Thread(() -> {
            partials.set(index, get(slices.get(index), bookCod, document, pattern));
        }));
    }
    for (Thread worker : workers) {
        worker.start();
    }
    for (Thread worker : workers) {
        worker.join();
    }

    List<BookCatalogueDto> catalogue = new ArrayList<>();
    for (List<BookCatalogueDto> partial : partials) {
        catalogue.addAll(partial);
    }
    return catalogue;
}
/**
 * Converts a list of {@code dd} catalogue entries into chapter DTOs.
 *
 * <p>For each entry the chapter code is the number between the last {@code '/'} and the last
 * {@code '.'} of the link matched by {@code pattern}; the chapter name is the entry's text.
 * Entries from which no numeric chapter code can be extracted (no child nodes, no link, or
 * an unexpected href) are skipped, because a chapter without a code cannot be requested.
 *
 * @param dd       catalogue entries to convert
 * @param bookCod  site id of the book, stored on every DTO
 * @param document index page the entries came from (not used by this method)
 * @param pattern  regular expression whose group 1 is the chapter href
 * @return one DTO per entry with a usable chapter code, in input order
 */
public static List<BookCatalogueDto> get(List<Element> dd, String bookCod, Document document, Pattern pattern) {
    List<BookCatalogueDto> chapters = new ArrayList<>();
    for (Element entry : dd) {
        Integer chapterCode = parseChapterCode(entry, pattern);
        if (chapterCode == null) {
            continue;
        }
        BookCatalogueDto chapter = new BookCatalogueDto();
        chapter.setCatalogueCod(chapterCode);
        chapter.setBookCod(bookCod);
        chapter.setCatalogueName(entry.text());
        chapters.add(chapter);
    }
    return chapters;
}

/**
 * Extracts the numeric chapter code from the last non-empty child node of a catalogue entry,
 * or returns {@code null} when the entry does not contain a link of the expected shape.
 */
private static Integer parseChapterCode(Element entry, Pattern pattern) {
    Node linkNode = null;
    for (Node child : entry.childNodes()) {
        if (!"".equals(child.toString())) {
            linkNode = child;
        }
    }
    if (linkNode == null) {
        return null;
    }
    Matcher matcher = pattern.matcher(linkNode.toString());
    if (!matcher.find()) {
        return null;
    }
    String href = matcher.group(1);
    int lastSlash = href.lastIndexOf('/');
    int lastDot = href.lastIndexOf('.');
    if (lastDot <= lastSlash) {
        return null;
    }
    try {
        return Integer.parseInt(href.substring(lastSlash + 1, lastDot));
    } catch (NumberFormatException notNumeric) {
        return null;
    }
}
/**
 * Downloads the given chapters from www.biquge.com and appends them to the part file
 * {@code /usr/local/webapps/file/downloading/<bookName>.txt}.
 *
 * <p>If the part file already exists, the chapter titles already stored in it are read back
 * and the download resumes after the last stored title. Each chapter page is fetched once
 * and, on failure, once more after a five second pause; a chapter that fails twice is
 * reported and skipped, so the remaining chapters are still downloaded. If the thread is
 * interrupted during that pause, the interrupt flag is restored and the method returns.
 * The writer is always closed before the method returns.
 *
 * @param bookCod          site id of the book (the URL uses the id stored on each chapter)
 * @param bookName         name of the part file without extension
 * @param bookCatalogueDto chapters to download, in order
 * @throws FileNotFoundException if the part file cannot be opened for appending
 */
private static void biquge(String bookCod, String bookName, List<BookCatalogueDto> bookCatalogueDto) throws FileNotFoundException {
    String path = "/usr/local/webapps/file/downloading/" + bookName + ".txt";
    File file = new File(path);
    if (!file.exists()) {
        new File(file.getParent()).mkdirs();
        try {
            file.createNewFile();
        } catch (IOException e) {
            e.printStackTrace();
        }
    } else {
        // Resume: drop everything up to and including the last chapter already on disk.
        List<BookCatalogueDto> stored = txtCatalogue(bookName);
        if (!stored.isEmpty()) {
            String lastStored = stored.get(stored.size() - 1).getCatalogueName();
            for (int i = 0; i < bookCatalogueDto.size(); i++) {
                if (bookCatalogueDto.get(i).getCatalogueName().equals(lastStored)) {
                    bookCatalogueDto = bookCatalogueDto.subList(i + 1, bookCatalogueDto.size());
                    break;
                }
            }
        }
    }
    // Check before opening the writer so that nothing is left open on the early return.
    if (bookCatalogueDto.isEmpty()) {
        return;
    }

    // Opened outside the try so that a FileNotFoundException still reaches the caller.
    FileOutputStream out = new FileOutputStream(file, true);
    try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(out))) {
        for (BookCatalogueDto chapter : bookCatalogueDto) {
            String url = "https://www.biquge.com/" + chapter.getBookCod() + "/" + chapter.getCatalogueCod() + ".html";
            Document document;
            try {
                document = Jsoup.connect(url).get();
            } catch (Exception firstFailure) {
                try {
                    Thread.sleep(5000);
                } catch (InterruptedException interrupted) {
                    Thread.currentThread().interrupt();
                    return;
                }
                try {
                    document = Jsoup.connect(url).get();
                } catch (Exception secondFailure) {
                    secondFailure.printStackTrace();
                    continue;
                }
            }
            try {
                bw.write(document.select("h1").text());
                bw.newLine();
                Elements content = document.select("#content");
                // A page without #content contributes only its title.
                if (!content.isEmpty()) {
                    String html = content.get(0).html().replace("<div id='content'>", "").replace("</div>", "");
                    String cleaned = html.replace("<script>readx();</script>", "").replace("<script>chaptererror();</script>", "");
                    for (String paragraph : cleaned.split("<br>")) {
                        bw.write(paragraph);
                        bw.newLine();
                    }
                }
                bw.flush();
            } catch (IOException writeFailure) {
                writeFailure.printStackTrace();
            }
        }
    } catch (IOException closeFailure) {
        closeFailure.printStackTrace();
    }
}
/**
 * Cuts a list into {@code num} consecutive slices keyed {@code 0 .. num - 1}.
 *
 * <p>Every slice except the last holds {@code t.size() / num} elements; the last slice also
 * takes the remainder. The slices are {@link List#subList} views of {@code t}, not copies.
 *
 * @param t   list to cut
 * @param num number of slices
 * @param <T> element type
 * @return map from slice index to slice
 */
public static <T> Map<Integer, List<T>> splitList(List<T> t, int num) {
    final int chunkSize = t.size() / num;
    final int lastIndex = num - 1;
    Map<Integer, List<T>> slices = new HashMap<>();
    for (int index = 0; index < num; index++) {
        int from = index * chunkSize;
        int to = (index == lastIndex) ? t.size() : from + chunkSize;
        slices.put(index, t.subList(from, to));
    }
    return slices;
}
/**
 * Appends the part files {@code <bookName>1.txt} .. {@code <bookName>3.txt} from the
 * {@code downloading} directory, in that order, to
 * {@code /usr/local/webapps/file/<bookName>.txt} and then deletes the part files.
 *
 * <p>Readers and the writer are closed even when copying fails. The part files are removed
 * only after the merged file has been flushed and closed, so a failed merge does not lose
 * the downloaded parts.
 *
 * @param bookName name of the book, used for both the part files and the merged file
 * @throws Exception if reading a part or writing the merged file fails
 */
public static void combine(String bookName) throws Exception {
    File merged = new File("/usr/local/webapps/file/" + bookName + ".txt");
    List<File> parts = new ArrayList<>();
    try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(merged, true)))) {
        for (int i = 1; i < 4; i++) {
            File part = new File("/usr/local/webapps/file/downloading/" + bookName + i + ".txt");
            parts.add(part);
            if (!part.exists()) {
                continue;
            }
            try (BufferedReader br = new BufferedReader(new FileReader(part))) {
                String line;
                while ((line = br.readLine()) != null) {
                    bw.write(line);
                    bw.newLine();
                }
            }
        }
        bw.flush();
    }
    for (File part : parts) {
        part.delete();
    }
}
/**
 * Reads the chapter titles already stored in the part file
 * {@code /usr/local/webapps/file/downloading/<bookName>.txt}.
 *
 * <p>The file is read as UTF-8, line by line. Every line that looks like a chapter heading
 * (optional whitespace, "第", one to nine characters, one of 章节卷集部篇回, one whitespace
 * character, then the rest of the line) yields one DTO carrying the book name and the
 * trimmed heading. Every matching heading is reported, so the last element of the result is
 * the last heading in the file.
 *
 * <p>A missing file or a read error is reported on standard output; in the error case the
 * headings collected so far are returned.
 *
 * @param bookName name of the part file without extension
 * @return the headings found, in file order; empty if the file is missing
 */
public static List<BookCatalogueDto> txtCatalogue(String bookName) {
    List<BookCatalogueDto> chapters = new ArrayList<>();
    String fileName = "/usr/local/webapps/file/downloading/" + bookName + ".txt";
    // Compiled once per call instead of once per line.
    Pattern chapterHeading = Pattern.compile("(^\\s*第)(.{1,9})[章节卷集部篇回](\\s{1})(.*)($\\s*)");
    try {
        File file = new File(fileName);
        if (!(file.isFile() && file.exists())) {
            System.out.println("找不到指定的文件");
            return chapters;
        }
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), "utf-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher matcher = chapterHeading.matcher(line);
                // The pattern is anchored at the start of the line, so one match per line at most.
                if (matcher.find()) {
                    BookCatalogueDto chapter = new BookCatalogueDto();
                    chapter.setBookName(bookName);
                    chapter.setCatalogueName(matcher.group().trim());
                    chapters.add(chapter);
                }
            }
        }
    } catch (Exception e) {
        System.out.println("读取文件内容出错");
        e.printStackTrace();
    }
    return chapters;
}
}
工具类,爬取之前调用可以忽略安全证书
[Java] 纯文本查看 复制代码
package com.aaa.config;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.X509TrustManager;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
/**
 * Installs a JVM-wide HTTPS socket factory that accepts every server certificate.
 *
 * <p><b>Security warning:</b> after {@link #init()} no certificate chain is validated for any
 * connection that uses the default {@link HttpsURLConnection} socket factory, which makes
 * those connections open to man-in-the-middle attacks. Use only for crawling sites whose
 * content is not trusted anyway.
 */
public class SSLHelper {
    /** User-agent string offered to callers; this class itself does not use it. */
    public static String USER_AGENT = "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 5.0)";

    /**
     * Replaces the default HTTPS socket factory with one backed by a trust-everything manager.
     *
     * <p>If the TLS context cannot be created or initialised, the failure is reported on
     * standard error and the previous default socket factory stays in place.
     */
    static public void init() {
        try {
            SSLContext context = SSLContext.getInstance("TLS");
            context.init(null, new X509TrustManager[]{new TrustEverything()}, new SecureRandom());
            HttpsURLConnection.setDefaultSSLSocketFactory(context.getSocketFactory());
        } catch (NoSuchAlgorithmException | KeyManagementException e) {
            // Best effort: keep running with normal certificate validation, but say so.
            System.err.println("SSLHelper.init failed, certificate validation stays enabled: " + e);
        }
    }

    /** Trust manager whose checks never reject a certificate chain. */
    private static final class TrustEverything implements X509TrustManager {
        @Override
        public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
            // Intentionally accepts every client certificate.
        }

        @Override
        public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
            // Intentionally accepts every server certificate.
        }

        @Override
        public X509Certificate[] getAcceptedIssuers() {
            return new X509Certificate[0];
        }
    }
}
实体类
[Java] 纯文本查看 复制代码
package com.aaa.dto;
import com.aaa.entity.BookCatalogue;
/**
* @author 杨森
* @version 1.0
* @Title: BookCatalogue
* @date 2020/7/28 14:56
*/
/**
 * Transfer object describing one chapter (catalogue entry) of a book together with
 * book-level attributes.
 *
 * <p>The chapter code and the book code can each be accessed under two spellings
 * ({@code ...Code} and {@code ...Cod}); both spellings read and write the same field.
 */
public class BookCatalogueDto {

    // ----- book-level attributes -----

    /** Book title. */
    private String bookName;
    /** Author of the book. */
    private String bookAuthor;
    /** Book code. */
    private String bookCode;
    /** Cover image of the book. */
    private String bookImage;
    /** Synopsis of the book. */
    private String bookIntro;

    // ----- chapter-level attributes -----

    /** Chapter id. */
    private Integer catalogueId;
    /** Chapter title. */
    private String catalogueName;
    /** Chapter code. */
    private Integer catalogueCode;
    /** Code of the following chapter. */
    private Integer nextCode;
    /** Code of the preceding chapter. */
    private Integer upCode;
    /** Chapter text. */
    private String catalogueText;

    /** Creates an empty DTO. */
    public BookCatalogueDto() {
    }

    /**
     * Creates a DTO from an entity, copying its chapter text, chapter code, chapter name
     * and next-chapter code; all other fields stay {@code null}.
     *
     * @param bookCatalogue entity to copy from
     */
    public BookCatalogueDto(BookCatalogue bookCatalogue) {
        this.catalogueText = bookCatalogue.getCatalogueText();
        this.catalogueCode = bookCatalogue.getCatalogueCode();
        this.catalogueName = bookCatalogue.getCatalogueName();
        this.nextCode = bookCatalogue.getNextCode();
    }

    // ----- book-level accessors -----

    public String getBookName() {
        return this.bookName;
    }

    public void setBookName(String bookName) {
        this.bookName = bookName;
    }

    public String getBookAuthor() {
        return this.bookAuthor;
    }

    public void setBookAuthor(String bookAuthor) {
        this.bookAuthor = bookAuthor;
    }

    public String getBookCode() {
        return this.bookCode;
    }

    public void setBookCode(String bookCode) {
        this.bookCode = bookCode;
    }

    /** Alternate spelling of {@link #getBookCode()}. */
    public String getBookCod() {
        return this.bookCode;
    }

    /** Alternate spelling of {@link #setBookCode(String)}. */
    public void setBookCod(String bookCod) {
        this.bookCode = bookCod;
    }

    public String getBookImage() {
        return this.bookImage;
    }

    public void setBookImage(String bookImage) {
        this.bookImage = bookImage;
    }

    public String getBookIntro() {
        return this.bookIntro;
    }

    public void setBookIntro(String bookIntro) {
        this.bookIntro = bookIntro;
    }

    // ----- chapter-level accessors -----

    public Integer getCatalogueId() {
        return this.catalogueId;
    }

    public void setCatalogueId(Integer catalogueId) {
        this.catalogueId = catalogueId;
    }

    public String getCatalogueName() {
        return this.catalogueName;
    }

    public void setCatalogueName(String catalogueName) {
        this.catalogueName = catalogueName;
    }

    public Integer getCatalogueCode() {
        return this.catalogueCode;
    }

    public void setCatalogueCode(Integer catalogueCode) {
        this.catalogueCode = catalogueCode;
    }

    /** Alternate spelling of {@link #getCatalogueCode()}. */
    public Integer getCatalogueCod() {
        return this.catalogueCode;
    }

    /** Alternate spelling of {@link #setCatalogueCode(Integer)}. */
    public void setCatalogueCod(Integer catalogueCod) {
        this.catalogueCode = catalogueCod;
    }

    public Integer getNextCode() {
        return this.nextCode;
    }

    public void setNextCode(Integer nextCode) {
        this.nextCode = nextCode;
    }

    public Integer getUpCode() {
        return this.upCode;
    }

    public void setUpCode(Integer upCode) {
        this.upCode = upCode;
    }

    public String getCatalogueText() {
        return this.catalogueText;
    }

    public void setCatalogueText(String catalogueText) {
        this.catalogueText = catalogueText;
    }
}
|
免费评分
-
查看全部评分
|