python:下载出现问题时要处理
图片总页数未知,需要一直下载,直到某一页重试十次仍然下载不下来为止。网页偶尔断线一次两次都没关系,给它重新连接十次的机会。
from time import sleep
import requests
import os
def download_image(url, filename, max_retries=10, min_size=10 * 1024):
    """Download *url* into *filename*, retrying on network errors.

    Retries up to *max_retries* times, sleeping 2 s between attempts.
    A downloaded file smaller than *min_size* bytes is treated as a bad
    page (e.g. an error placeholder image) and deleted.

    Returns True on success, False when the file was too small or all
    retries were exhausted.
    """
    retries = 0
    while retries < max_retries:
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari'}
            # BUG FIX: the original called requests.get(tp, ...) — the
            # global loop variable — instead of the `url` parameter, so
            # the function silently ignored its argument.
            response = requests.get(url, stream=True, headers=headers)
            with open(filename, 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
            # Check the file size: undersized files are error pages.
            if os.path.getsize(filename) < min_size:
                os.remove(filename)  # delete files below the size threshold
                print(f"File {filename} is too small. Skipping...")
                return False
            else:
                print(f"File {filename} downloaded successfully.")
                return True
        except requests.RequestException as e:
            print(f"Error downloading {url}: {e}. Retrying...")
            retries += 1
            sleep(2)  # wait a moment before retrying
    print(f"Failed to download {url} after {max_retries} retries.")
    return False
# Book IDs to download. NOTE(review): the original post redacted the real
# value ("ts = +" is not valid Python) — fill in the actual IDs here.
ts = []  # e.g. ts = ['book_a', 'book_b']
for t in ts:
    # Create one folder per book.
    img_dir = f'd:/a/{t}'
    if not os.path.exists(img_dir):
        os.makedirs(img_dir)
    # Page numbers are unknown in advance: keep downloading until a
    # page finally fails (download_image returns False).
    i = 1
    while True:
        tp = f'http://zx.my.gov.cn/lib/Book/ImageProcess?file=/files/{t}/{i}.jpg&width=1140&height=1600'
        img_path = f'{img_dir}/img_{str(i).zfill(3)}.jpg'
        i += 1
        if not download_image(tp, img_path):
            break
import time
import requests
import os
import threading
def download_image(url, filename, max_retries=10, min_size=10 * 1024, timeout=30):
    """Download *url* into *filename*, retrying on network errors.

    Retries up to *max_retries* times, sleeping 3 s between attempts.
    A downloaded file smaller than *min_size* bytes is treated as a bad
    page (e.g. an error placeholder image) and deleted.

    *timeout* (new, backward-compatible keyword) bounds each request so a
    stalled connection cannot hang a worker thread forever.

    Returns True on success, False when the file was too small or all
    retries were exhausted.
    """
    retries = 0
    while retries < max_retries:
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari'}
            # ROBUSTNESS: the original had no timeout, so a dead
            # connection would block the calling thread indefinitely.
            response = requests.get(url, stream=True, headers=headers,
                                    timeout=timeout)
            with open(filename, 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
            # Check the file size: undersized files are error pages.
            if os.path.getsize(filename) < min_size:
                os.remove(filename)  # delete files below the size threshold
                print(f"File {filename} is too small. Skipping...")
                return False
            else:
                print(f"File {filename} downloaded successfully.")
                return True
        except requests.RequestException as e:
            print(f"Error downloading {url}: {e}. Retrying...")
            retries += 1
            time.sleep(3)  # longer delay between retries
    print(f"Failed to download {url} after {max_retries} retries.")
    return False
def download_images_in_range(ts):
    """Download every page image for each book id in *ts*.

    The page count is unknown up front, so pages are fetched one by one
    until download_image reports a failure for the next page.
    """
    for book in ts:
        # One destination folder per book.
        target_dir = f'd:/a/{book}'
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        page = 1
        keep_going = True
        while keep_going:
            page_url = f'http://zx.my.gov.cn/lib/Book/ImageProcess?file=/files/{book}/{page}.jpg&width=1140&height=1600'
            save_path = f'{target_dir}/img_{page:03d}.jpg'
            keep_going = download_image(page_url, save_path)
            if keep_going:
                page += 1
# Book IDs to download. NOTE(review): the original post redacted the real
# value ("ts = +" is not valid Python) — fill in the actual IDs here.
ts = []  # e.g. ts = ['book_a', 'book_b']
# Number of worker threads.
num_threads = 5
# Split the work across the threads.
threads = []
for i in range(num_threads):
    start_index = i * (len(ts) // num_threads)
    end_index = (i + 1) * (len(ts) // num_threads) if i < num_threads - 1 else len(ts)
    # BUG FIX: the original passed the whole `ts` to every thread
    # (args=(ts,)), so all five threads downloaded every book five times
    # over. Each thread must receive only its own slice.
    thread = threading.Thread(target=download_images_in_range,
                              args=(ts[start_index:end_index],))
    threads.append(thread)
    thread.start()
# Wait for all threads to finish.
for thread in threads:
    thread.join()
print("All threads have finished downloading images.")
调味包 发表于 2024-4-21 14:15
试试看
import time
import requests
又学到了,太感谢了!一是多线程,二是一行代码完成多个任务。原本要 7 个小时的下载任务,现在 1 个多小时就可以完成。
页:
[1]