Updated the script: added a download progress indicator. Filenames are left unchanged, and already-downloaded files are skipped to avoid duplicates.
import requests
import os
import time
from bs4 import BeautifulSoup
url = 'https://wallhaven.cc/search'
# Create the wallpaper folder 'bizhi' if it does not already exist
if not os.path.exists('bizhi'):
    os.mkdir('bizhi')
keyword = input('Enter the wallpaper keyword to search for: ')
start_page = int(input('Start page: '))
end_page = int(input('End page: ')) + 1  # range() excludes the end, so add 1
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.3'
}
# Check whether a file has already been downloaded by looking it up in log.txt
def check_file_downloaded(filename):
    if not os.path.exists('log.txt'):
        return False
    with open('log.txt', 'r') as log_file:
        downloaded_files = log_file.read().splitlines()
    return filename in downloaded_files
# Append a downloaded filename to log.txt
def write_to_log(filename):
    with open('log.txt', 'a') as log_file:
        log_file.write(filename + '\n')
for i in range(start_page, end_page):
    data = {
        'q': keyword,
        'sorting': 'random',
        'ref': 'fp',
        'seed': 'pDRjMC',
        'page': i,
    }
    # Request the search page for this keyword and page number
    resp = requests.get(url=url, params=data, headers=headers)
    # Parse the search results
    page = BeautifulSoup(resp.text, 'html.parser')
    # Each result links to a wallpaper detail page via an <a class="preview"> tag
    urls = page.find_all('a', attrs={'class': 'preview'})
    for n_url in urls:
        href = n_url.get('href')
        # Request the wallpaper detail page
        resp1 = requests.get(url=href, headers=headers)
        # Parse the detail page
        page1 = BeautifulSoup(resp1.text, 'html.parser')
        img = page1.find('img', attrs={'id': 'wallpaper'})
        if img is None:  # skip pages where the full-size image tag is missing
            continue
        # The full-size image URL is in the src attribute
        img_url = img.get('src')
        # Keep the original filename: the last segment of the image URL
        file_name = img_url.split('/')[-1]
        if check_file_downloaded(file_name):
            print(f'File {file_name} has already been downloaded, skipping')
            continue
        # Download the file in streaming mode so progress can be reported
        resp2 = requests.get(url=img_url, headers=headers, stream=True)
        # Total size from the Content-Length header (0 if the server omits it)
        file_size = int(resp2.headers.get('Content-Length', 0))
        # Progress counter and start time for the speed estimate
        download_progress = 0
        start_time = time.time()
        with open(f'./bizhi/{file_name}', 'wb') as f:
            for chunk in resp2.iter_content(chunk_size=1024):
                if chunk:
                    # Write this chunk to disk
                    f.write(chunk)
                    # Update the running progress
                    download_progress += len(chunk)
                    # Average speed so far = bytes written / elapsed seconds
                    elapsed = time.time() - start_time
                    download_speed = download_progress / elapsed if elapsed > 0 else 0
                    print(f'File: {file_name}, speed: {download_speed:.0f} bytes/s, '
                          f'progress: {download_progress}/{file_size} bytes', end='\r')
        # Record the filename in log.txt so it will be skipped next time
        write_to_log(file_name)
        print()

print('Scraping finished')
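
If you prefer a real progress bar over the manual print, the chunk loop can be wrapped with tqdm. A minimal sketch, assuming tqdm is installed (pip install tqdm); download_with_bar is a hypothetical helper reusing the resp2, file_name, and file_size variables from the script above:

from tqdm import tqdm  # assumption: tqdm is installed separately

def download_with_bar(resp2, file_name, file_size):
    # unit='B' with unit_scale=True makes tqdm print KB/MB and speed automatically
    with open(f'./bizhi/{file_name}', 'wb') as f, tqdm(
            total=file_size, unit='B', unit_scale=True, desc=file_name) as bar:
        for chunk in resp2.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
                bar.update(len(chunk))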
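
One design note: check_file_downloaded re-reads log.txt for every image. If the log grows large, loading it once into a set at startup avoids the repeated file reads. A sketch of that variant, keeping the same log.txt format:

# Load the log once into a set at startup (empty set if no log exists yet)
downloaded = set()
if os.path.exists('log.txt'):
    with open('log.txt', 'r') as log_file:
        downloaded = set(log_file.read().splitlines())

# Then in the loop, replace the check and the log write with:
#     if file_name in downloaded:
#         continue
#     ...download...
#     downloaded.add(file_name)
#     write_to_log(file_name)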