Wallhaven full-size image downloader, now with deduplication
The names of already-downloaded images are recorded in log.txt. Before each download the script checks the log and skips any image already listed; pages are advanced automatically.

import os
import requests
from bs4 import BeautifulSoup
from tkinter import Tk, filedialog
import logging
def get_images_from_url(url):
    # Fetch one listing page and collect the thumbnail URLs from the
    # lazy-loaded <img> tags.
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        lazyload_images = soup.find_all('img', class_='lazyload')
        image_urls = []
        for lazyload_image in lazyload_images:
            image_url = lazyload_image.get('data-src')
            if image_url:
                image_urls.append(image_url)
        return image_urls
    return []

def modify_image_url(image_url, formats):
    # Map a thumbnail URL such as https://th.wallhaven.cc/small/zy/zyg2kv.jpg
    # to candidate full-size URLs such as
    # https://w.wallhaven.cc/full/zy/wallhaven-zyg2kv.jpg. The full image's
    # real extension is unknown, so yield one candidate per format and let
    # the caller try them in order.
    parts = image_url.rstrip('/').split('/')
    image_name = parts[-1]                      # e.g. zyg2kv.jpg
    subdir = parts[-2]                          # e.g. zy
    image_id = os.path.splitext(image_name)[0]  # e.g. zyg2kv
    for fmt in formats:
        yield f"https://w.wallhaven.cc/full/{subdir}/wallhaven-{image_id}.{fmt}"

def download_images(image_urls, save_folder, page, formats):
    log_path = os.path.join(save_folder, 'log.txt')
    logging.basicConfig(filename=log_path, level=logging.INFO)
    # Load the names of previously downloaded images from log.txt; each
    # line ends with the file name, so keep only the part after the
    # last colon.
    existing_images = set()
    if os.path.exists(log_path):
        with open(log_path, 'r') as log_file:
            for line in log_file:
                existing_images.add(line.strip().split(':')[-1])
    for url in image_urls:
        image_name = url.split('/')[-1]
        image_path = os.path.join(save_folder, image_name)
        if image_name in existing_images:
            print(f"Already in log, skipping: {image_name}")
            continue
        if not os.path.exists(image_path):
            downloaded = False
            for modified_url in modify_image_url(url, formats):
                response = requests.get(modified_url, timeout=30)
                if response.status_code == 200:
                    with open(image_path, 'wb') as f:
                        f.write(response.content)
                    logging.info(f"{image_name}")
                    print(f"Downloaded: {image_name}")
                    downloaded = True
                    break
            if not downloaded:
                print(f"All candidate URLs failed, skipping: {image_name}")
    print(f"Finished page {page}, moving on to the next...")

if __name__ == "__main__":
    # Hide the root Tk window; only the folder-picker dialog is needed.
    root = Tk()
    root.withdraw()
    save_folder = filedialog.askdirectory(title="Choose a download folder")
    if save_folder:
        page = 1
        image_formats = ['jpg', 'png', 'gif', 'jpeg']
        while True:
            url = f"https://wallhaven.cc/toplist?page={page}"
            image_urls = get_images_from_url(url)
            if not image_urls:
                break
            download_images(image_urls, save_folder, page, image_formats)
            page += 1
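A note on the log format: with logging's default configuration, each line written to log.txt looks like INFO:root:zyg2kv.jpg (the file name here is just an example), which is why stripping everything up to the last colon recovers the bare image name when the log is read back on the next run.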
One suggestion from the replies: use Redis (writing the same key again simply overwrites it), store every crawled image URL in it, and at the end pull the URLs back out, fetch the bytes, and save them. A sketch of this idea follows.
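Below is a minimal sketch of that Redis idea. It assumes a local Redis server and the redis-py package; the key name 'wallhaven:urls' and the helper names are illustrative, not from the original post. A Redis set is used instead of plain keys, since SADD silently ignores duplicates and gives the same dedup-by-overwrite effect.

import os
import redis
import requests

r = redis.Redis(host='localhost', port=6379, db=0)

def queue_url(image_url):
    # SADD is idempotent: adding the same URL twice is a no-op, so the
    # set deduplicates crawled URLs for free.
    r.sadd('wallhaven:urls', image_url)

def drain_and_download(save_folder):
    # Pull every queued URL back out, fetch the bytes, and save to disk.
    for raw in r.smembers('wallhaven:urls'):
        url = raw.decode()
        name = url.split('/')[-1]
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            with open(os.path.join(save_folder, name), 'wb') as f:
                f.write(response.content)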