wallpapershome壁纸网站爬虫代码分享
本帖最后由 YunYi1021 于 2024-3-24 16:17 编辑
最近换了新的笔记本,以前下载的壁纸不想再用了;在论坛里没有找到针对这个网站的可用工具,于是就简单写了一个,现在分享出来。
- 代码中默认下载的是壁纸所支持的最高分辨率文件。
- 使用到的工具有 hutool(主要用于观察文件实时下载进度)和 jsoup(用于获取页面并解析 html)。
- V1 版本的代码注释掉了线程池并发下载,原因是下载过快会触发网站反爬机制,导致下载失败。
- 代码中 formUrl 的链接是 wallpapershome 中某个壁纸分类的链接,可以自行替换为其他分类的链接。
- 代码中发起请求时设置的 Cookie 是通过浏览器控制台直接拿出来的;如果 Cookie 不可用,可以自己在浏览器中访问该网站,通过控制台查看任意请求即可获取。
2024.03.24优化,加入多线程批量下载、失败重试(失败后等待一分钟后自动重试),提高下载效率
V2版本
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.io.StreamProgress;
import cn.hutool.core.util.RandomUtil;
import cn.hutool.core.util.StrUtil;
import cn.hutool.http.HttpUtil;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
public class Download {
// Fixed-size thread pool (10 threads) that runs the download tasks submitted by main
private final static ExecutorService executor = Executors.newFixedThreadPool(10);
/**
* URL of the paginated wallpaper listing to crawl
*/
private final String formUrl = "https://wallpapershome.com/nature";
/**
* Site root that is prepended to hrefs taken from the pages
* (used to build the detail-page URLs and the download URLs)
*/
private final String mainUrl = "https://wallpapershome.com";
/**
* Directory the downloaded files are saved to
*/
private static final String filePath = "D:\\壁纸\\";
/**
* Total number of pages; starts at 1 and is raised by getForm when it sees larger pagination links
*/
private static Integer pageCount = 1;
// Page currently being processed by main
private static Integer page = 1;
// Incremented in the download progress callback's finish(); reported by main at the end
private static AtomicInteger downCount = new AtomicInteger(0);
// While true, main skips files whose name is in the local file list; main sets it to false
// the first time it meets a file that is not in that list
private static AtomicBoolean isCheckFileName = new AtomicBoolean(true);
// Lock and condition used by main for the timed wait before a retry
private static Lock lock = new ReentrantLock();
private static Condition condition = lock.newCondition();
/**
 * Crawls the listing page by page and downloads every wallpaper found.
 *
 * <p>All wallpapers of one page are submitted to the shared thread pool; the loop waits until the
 * whole page is finished before requesting the next page, pausing 1-1.5 s in between.
 *
 * @param args unused
 */
public static void main(String[] args) {
    Download download = new Download();
    // Names of the files that already exist in the target directory
    List<String> fileNameList = Collections.synchronizedList(FileUtil.listFileNames(filePath));
    try {
        while (page <= pageCount) {
            List<String> urllist = download.getForm(page);
            System.out.println("=========================开始下载第" + page + "页==========================");
            if (page == 1) {
                // pageCount is only known after the first listing page has been parsed
                System.out.println("============================总共" + pageCount + "页============================");
            }
            // One count per wallpaper so the page is only left once every task has ended
            CountDownLatch latch = new CountDownLatch(urllist.size());
            for (String url : urllist) {
                executor.submit(() -> {
                    try {
                        downloadWithRetry(download, url, fileNameList);
                    } catch (RuntimeException e) {
                        // Report instead of letting the Future swallow the failure silently
                        System.out.println("网络路径:" + url);
                        System.out.println("文件下载失败:" + e.getMessage());
                    } finally {
                        // Always count down, otherwise latch.await() below would block forever
                        latch.countDown();
                    }
                });
            }
            latch.await();
            System.out.println("第" + page + "页下载完成");
            page++;
            Thread.sleep(RandomUtil.randomInt(1000, 1500));
        }
    } catch (InterruptedException e) {
        // Preserve the interrupt status for whoever runs this thread
        Thread.currentThread().interrupt();
        System.out.println("文件下载失败:" + e.getMessage());
    } catch (Exception e) {
        System.out.println("文件下载失败:" + e.getMessage());
        e.printStackTrace();
    } finally {
        executor.shutdown();
        System.out.println("============所有文件下载完成,本次任务成功下载" + downCount.get() + "个文件========");
    }
}

/**
 * Downloads the wallpaper behind one detail page, making at most three attempts.
 *
 * <p>After a failed attempt the task waits one minute before trying again; no wait is made after
 * the final attempt. While {@code isCheckFileName} is set, a wallpaper whose name is already in
 * {@code fileNameList} is skipped; the first wallpaper that is not in the list switches the check
 * off for all later wallpapers (behaviour kept from the original code).
 *
 * @param download     crawler instance used to resolve and fetch the file
 * @param url          wallpaper detail-page URL
 * @param fileNameList names of the files already present locally
 */
private static void downloadWithRetry(Download download, String url, List<String> fileNameList) {
    final long retryDelayMillis = 60_000L;
    int retries = 3;
    while (retries > 0) {
        try {
            String downloadUrl = download.getDownloadUrl(url);
            String numberFileName = getNumberFileName(downloadUrl);
            if (isCheckFileName.get() && fileNameList.contains(numberFileName)) {
                System.out.println(numberFileName + "文件已下载,跳过");
                return;
            }
            isCheckFileName.set(false);
            // Exactly one download per wallpaper
            download.download(downloadUrl);
            return;
        } catch (IOException e) {
            System.out.println("网络路径:" + url);
            System.out.println("文件下载失败:" + e.getMessage());
            retries--;
            if (retries > 0) {
                try {
                    Thread.sleep(retryDelayMillis);
                } catch (InterruptedException interrupted) {
                    Thread.currentThread().interrupt();
                    return;
                }
            }
        }
    }
}
/**
 * Fetches one listing page and collects the wallpaper detail-page URLs linked from it.
 *
 * <p>As a side effect, {@code pageCount} is raised to the largest page number found among the
 * page's {@code ?page=N} pagination links.
 *
 * @param pageNum page to request; when {@code null} the listing URL is requested without a page parameter
 * @return detail-page URLs ({@code mainUrl} + href) of every link ending in ".html", in document order
 * @throws IOException if the page cannot be fetched
 */
public List<String> getForm(Integer pageNum) throws IOException {
    String url = formUrl;
    if (Objects.nonNull(pageNum)) {
        url = formUrl + "?page=" + pageNum;
    }
    Connection connect = Jsoup.connect(url);
    connect.header("Cookie", "sid=g4m2ugp3v0sicdutrpqeepeqa1; screen-width=1440; screen-height=900; ratio=2; window-width=1392; col-width=1115; window-height=744; FCNEC=%5B%5B%22AKsRol_46hf2y-AJJMJjAiq0OgWf0wejs54kaJMdEnCoI9yKSQPj6AOgAgEB9oidBjSxbXtAMX58lbRNiBUaM_4v3DLqN2o1hxYouVGtBgYBjoEZIlzHd1hhNhXe_-RKSylBcQnBlysPs5cujvl6p2tJcUIHEgX8zA%3D%3D%22%5D%5D");
    connect.header("Host", "wallpapershome.com");
    connect.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0");
    Document doc = connect.get();

    final String pagePrefix = "?page=";
    List<String> result = new ArrayList<>();
    for (Element link : doc.select("a")) {
        String href = link.attr("href");
        if (StrUtil.isBlank(href)) {
            continue;
        }
        if (href.endsWith(".html")) {
            // Link to a wallpaper detail page
            result.add(mainUrl + href);
        } else if (href.startsWith(pagePrefix)) {
            // Pagination link: accept only a plain number so a malformed href cannot abort the crawl
            String pageText = href.substring(pagePrefix.length());
            if (pageText.matches("\\d{1,9}")) {
                pageCount = Math.max(pageCount, Integer.parseInt(pageText));
            }
        }
    }
    return result;
}
/**
 * Resolves the download link of the highest resolution offered on a wallpaper detail page.
 *
 * <p>Every {@code <p>} element is split on spaces and each {@code WIDTHxHEIGHT} token is treated
 * as a candidate resolution. A candidate counts only when some link on the page contains that
 * token; the candidate with the most pixels wins.
 *
 * <p>NOTE(review): the array indices of the published listing were lost, so which token of a
 * paragraph held the resolution is not recoverable; scanning every token avoids depending on it.
 *
 * @param url wallpaper detail-page URL
 * @return {@code mainUrl} + href of the first link containing the best resolution, or "" when
 *         the page offers no resolution with a matching link
 * @throws IOException if the page cannot be fetched
 */
private String getDownloadUrl(String url) throws IOException {
    Connection connect = Jsoup.connect(url);
    connect.header("Cookie", "sid=g4m2ugp3v0sicdutrpqeepeqa1; screen-width=1440; screen-height=900; ratio=2; FCNEC=%5B%5B%22AKsRol8iocKLTfJ3-7NEGMVingOM01ugKcNMT6KJgW7mBKWkdyKrX4nFYMvemNKhehcHRa_WEev2BEc7SYo0mloS-q1g0HF_AV6RB4n_YPQx9Q23ZHR_F3TuBMUJ0Em3K41uJ1nWPVtn93l4f-MPJnIaHa3b0hJH0w%3D%3D%22%5D%5D; window-width=1396; window-height=750; col-width=1118");
    connect.header("Host", "wallpapershome.com");
    connect.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0");
    Document doc = connect.get();

    List<String> links = new ArrayList<>();
    for (Element anchor : doc.select("a")) {
        String href = anchor.attr("href");
        if (StrUtil.isNotBlank(href)) {
            links.add(href);
        }
    }

    long bestPixels = 0;
    String bestLink = "";
    for (Element paragraph : doc.select("p")) {
        for (String token : paragraph.text().split(" ")) {
            // Bounded digit counts keep parseLong and the product within range
            if (!token.matches("\\d{1,6}x\\d{1,6}")) {
                continue;
            }
            String[] dims = token.split("x");
            long pixels = Long.parseLong(dims[0]) * Long.parseLong(dims[1]);
            if (pixels <= bestPixels) {
                continue;
            }
            Optional<String> match = links.stream().filter(link -> link.contains(token)).findFirst();
            if (match.isPresent()) {
                bestPixels = pixels;
                bestLink = match.get();
            }
        }
    }
    // Without a match return "" rather than an arbitrary link from the page
    return bestLink.isEmpty() ? "" : mainUrl + bestLink;
}
/**
 * Downloads {@code url} into the target directory under the URL's last path segment.
 *
 * <p>A blank URL is ignored. Progress is reported on stdout, and {@code downCount} is
 * incremented when the transfer finishes.
 *
 * @param url absolute download URL; blank means "nothing to download"
 * @throws IOException if the transfer fails for any reason (the original cause is attached),
 *                     so that callers can react, e.g. by retrying
 */
private void download(String url) throws IOException {
    if (StrUtil.isBlank(url)) {
        return;
    }
    String fileName = getFileName(url);
    try {
        // Timeout: 5 minutes
        HttpUtil.downloadFile(url, FileUtil.file(filePath + fileName), 1000 * 60 * 5, new StreamProgress() {
            @Override
            public void start() {
                System.out.println(Thread.currentThread().getName() + ": " + fileName + "开始下载。。。。");
            }

            @Override
            public void progress(long total, long progressSize) {
                // Per-chunk progress output is intentionally disabled
            }

            @Override
            public void finish() {
                downCount.incrementAndGet();
                System.out.println(Thread.currentThread().getName() + ": " + fileName + " 下载完成!");
            }
        });
    } catch (Exception e) {
        System.err.println(Thread.currentThread().getName() + ": " + fileName + " 下载失败!");
        // Propagate instead of swallowing: the declared IOException is what callers catch to retry
        throw new IOException(fileName + ": " + e.getMessage(), e);
    }
}
/**
 * Returns the part of {@code url} that follows its last "/" (the whole string when it has no "/").
 *
 * @param url URL or path to take the file name from
 * @return text after the last "/"
 */
private static String getFileName(String url) {
    int lastSlash = url.lastIndexOf('/');
    return url.substring(lastSlash + 1);
}
/**
* Derives the name that main looks up in the list of already-downloaded files.
* The URL is split on "-"; when it contains no "-", the text after the last "/" is used.
*
* NOTE(review): "fileName = split;" assigns a String[] to a String and cannot compile as shown.
* An array index was presumably lost when this listing was published - restore it from the
* author's original source; which element was intended cannot be determined from this file.
*
* @param url download URL
* @return name to compare against the local file list
*/
private static String getNumberFileName(String url) {
String[] split = url.split("-");
String fileName;
if (split.length > 1) {
fileName = split;
} else {
// No "-" in the URL: fall back to the last path segment
fileName = url.substring(url.lastIndexOf("/") + 1);
}
return fileName;
}
}
V1版本
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.io.StreamProgress;
import cn.hutool.core.util.StrUtil;
import cn.hutool.http.HttpUtil;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicInteger;
public class Download {
/**
* URL of the paginated wallpaper listing to crawl
*/
private String formUrl = "https://wallpapershome.com/nature";
/**
* Site root that is prepended to hrefs taken from the pages
* (used to build the detail-page URLs and the download URLs)
*/
private String mainUrl = "https://wallpapershome.com";
/**
* Directory the downloaded files are saved to
*/
private String filePath = "F:\\壁纸\\";
/**
* Total number of pages; starts at 1 and is raised by getForm when it sees larger pagination links
*/
private static Integer pageCount = 1;
// Page currently being processed by main
private static Integer page = 1;
// Incremented in the download progress callback's finish(); reported by main at the end
private static AtomicInteger downCount = new AtomicInteger(0);
/**
 * Crawls the listing page by page and downloads every wallpaper found, one file at a time.
 *
 * <p>The loop pauses 1.5 s between pages. Any failure ends the crawl; the number of files
 * downloaded so far is printed in all cases.
 *
 * @param args unused
 */
public static void main(String[] args) {
    Download download = new Download();
    // ExecutorService executor = Executors.newFixedThreadPool(5);
    try {
        while (page <= pageCount) {
            List<String> urllist = download.getForm(page);
            System.out.println("=========================开始下载第" + page + "页==========================");
            if (page == 1) {
                // pageCount is only known after the first listing page has been parsed
                System.out.println("============================总共" + pageCount + "页============================");
            }
            // Pool-based variant, left disabled in this version:
            /* CompletionService<Void> completionService = new ExecutorCompletionService<>(executor);
            for (String url : urllist) {
                completionService.submit(() -> {
                    String downloadUrl = download.getDownloadUrl(url);
                    download.download(downloadUrl);
                    return null;
                });
            }
            // Collect the task results
            for (int i = 0; i < urllist.size(); i++) {
                completionService.take().get(); // take() returns whichever task completes first
            }*/
            for (String url : urllist) {
                String downloadUrl = download.getDownloadUrl(url);
                download.download(downloadUrl);
            }
            // Report before advancing so the message names the page that was just finished
            System.out.println("第" + page + "页下载完成");
            page++;
            Thread.sleep(1500);
        }
    } catch (InterruptedException e) {
        // Preserve the interrupt status for whoever runs this thread
        Thread.currentThread().interrupt();
        System.out.println("文件下载失败:" + e.getMessage());
    } catch (Exception e) {
        System.out.println("文件下载失败:" + e.getMessage());
        e.printStackTrace();
    } finally {
        // executor.shutdown();
        System.out.println("============所有文件下载完成,本次任务成功下载" + downCount.get() + "个文件========");
    }
}
/**
 * Fetches one listing page and collects the wallpaper detail-page URLs linked from it.
 *
 * <p>As a side effect, {@code pageCount} is raised to the largest page number found among the
 * page's {@code ?page=N} pagination links.
 *
 * @param pageNum page to request; when {@code null} the listing URL is requested without a page parameter
 * @return detail-page URLs ({@code mainUrl} + href) of every link ending in ".html", in document order
 * @throws IOException if the page cannot be fetched
 */
public List<String> getForm(Integer pageNum) throws IOException {
    String url = formUrl;
    if (Objects.nonNull(pageNum)) {
        url = formUrl + "?page=" + pageNum;
    }
    Connection connect = Jsoup.connect(url);
    connect.header("Cookie", "sid=g4m2ugp3v0sicdutrpqeepeqa1; screen-width=1440; screen-height=900; ratio=2; window-width=1392; col-width=1115; window-height=744; FCNEC=%5B%5B%22AKsRol_46hf2y-AJJMJjAiq0OgWf0wejs54kaJMdEnCoI9yKSQPj6AOgAgEB9oidBjSxbXtAMX58lbRNiBUaM_4v3DLqN2o1hxYouVGtBgYBjoEZIlzHd1hhNhXe_-RKSylBcQnBlysPs5cujvl6p2tJcUIHEgX8zA%3D%3D%22%5D%5D");
    connect.header("Host", "wallpapershome.com");
    connect.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0");
    Document doc = connect.get();

    final String pagePrefix = "?page=";
    List<String> result = new ArrayList<>();
    for (Element link : doc.select("a")) {
        String href = link.attr("href");
        if (StrUtil.isBlank(href)) {
            continue;
        }
        if (href.endsWith(".html")) {
            // Link to a wallpaper detail page
            result.add(mainUrl + href);
        } else if (href.startsWith(pagePrefix)) {
            // Pagination link: accept only a plain number so a malformed href cannot abort the crawl
            String pageText = href.substring(pagePrefix.length());
            if (pageText.matches("\\d{1,9}")) {
                pageCount = Math.max(pageCount, Integer.parseInt(pageText));
            }
        }
    }
    return result;
}
/**
 * Resolves the download link of the highest resolution offered on a wallpaper detail page.
 *
 * <p>Every {@code <p>} element is split on spaces and each {@code WIDTHxHEIGHT} token is treated
 * as a candidate resolution. A candidate counts only when some link on the page contains that
 * token; the candidate with the most pixels wins. The chosen resolution is printed to stdout.
 *
 * <p>NOTE(review): the array indices of the published listing were lost, so which token of a
 * paragraph held the resolution is not recoverable; scanning every token avoids depending on it.
 *
 * @param url wallpaper detail-page URL
 * @return {@code mainUrl} + href of the first link containing the best resolution, or "" when
 *         the page offers no resolution with a matching link
 * @throws IOException if the page cannot be fetched
 */
private String getDownloadUrl(String url) throws IOException {
    Connection connect = Jsoup.connect(url);
    connect.header("Cookie", "sid=g4m2ugp3v0sicdutrpqeepeqa1; screen-width=1440; screen-height=900; ratio=2; FCNEC=%5B%5B%22AKsRol8iocKLTfJ3-7NEGMVingOM01ugKcNMT6KJgW7mBKWkdyKrX4nFYMvemNKhehcHRa_WEev2BEc7SYo0mloS-q1g0HF_AV6RB4n_YPQx9Q23ZHR_F3TuBMUJ0Em3K41uJ1nWPVtn93l4f-MPJnIaHa3b0hJH0w%3D%3D%22%5D%5D; window-width=1396; window-height=750; col-width=1118");
    connect.header("Host", "wallpapershome.com");
    connect.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0");
    Document doc = connect.get();

    List<String> links = new ArrayList<>();
    for (Element anchor : doc.select("a")) {
        String href = anchor.attr("href");
        if (StrUtil.isNotBlank(href)) {
            links.add(href);
        }
    }

    long bestPixels = 0;
    String maxRatio = "";   // winning WIDTHxHEIGHT token
    String maxRatio2 = "";  // remaining text of the paragraph that token came from
    String bestLink = "";
    for (Element paragraph : doc.select("p")) {
        String text = paragraph.text();
        for (String token : text.split(" ")) {
            // Bounded digit counts keep parseLong and the product within range
            if (!token.matches("\\d{1,6}x\\d{1,6}")) {
                continue;
            }
            String[] dims = token.split("x");
            long pixels = Long.parseLong(dims[0]) * Long.parseLong(dims[1]);
            if (pixels <= bestPixels) {
                continue;
            }
            Optional<String> match = links.stream().filter(link -> link.contains(token)).findFirst();
            if (match.isPresent()) {
                bestPixels = pixels;
                maxRatio = token;
                maxRatio2 = text.replace(token, "").trim();
                bestLink = match.get();
            }
        }
    }
    System.out.println("最大分辨率:" + maxRatio2 + " " + maxRatio);
    // Without a match return "" rather than an arbitrary link from the page
    return bestLink.isEmpty() ? "" : mainUrl + bestLink;
}
/**
* Downloads {@code url} into the target directory, printing start, progress and completion
* messages to stdout and incrementing {@code downCount} when the transfer finishes.
* A blank URL is ignored.
*
* NOTE(review): "fileName = split;" assigns a String[] to a String and cannot compile as shown.
* An array index was presumably lost when this listing was published - restore it from the
* author's original source; which element was intended cannot be determined from this file.
*
* @param url absolute download URL; blank means "nothing to download"
* @throws IOException declared by the signature; no statement in this body throws it directly
*/
private void download(String url) throws IOException {
if (StrUtil.isBlank(url)) {
return;
}
// Local file name: taken from the "-"-separated parts of the URL, or, when the URL
// contains no "-", the text after its last "/"
String[] split = url.split("-");
String fileName;
if (split.length > 1) {
fileName = split;
} else {
fileName = url.substring(url.lastIndexOf("/") + 1);
}
// Timeout: 5 minutes
HttpUtil.downloadFile(url, FileUtil.file(filePath + fileName), 1000 * 60 * 5, new StreamProgress() {
@Override
public void start() {
System.out.println(Thread.currentThread().getName() + ": " + fileName + "开始下载。。。。");
}
@Override
public void progress(long total, long progressSize) {
System.out.println(Thread.currentThread().getName() + ": " + fileName + ",已下载:" + FileUtil.readableFileSize(progressSize));
}
@Override
public void finish() {
downCount.incrementAndGet();
System.out.println(Thread.currentThread().getName() + ": " + fileName + " 下载完成!");
}
});
}
}
观摩观摩,学习一下{:1_911:} 来学习一下 观摩观摩,学习一下{:1_893:} Thanks for sharing... Thank you 阔以,正好换点壁纸 大佬求教一下如何应用 看不懂但支持 虽然不会,但是感觉很厉害