本帖最后由 cattie 于 2023-7-24 11:40 编辑
[Python] 纯文本查看 复制代码
import threading
import time
import requests
from bs4 import BeautifulSoup
import os
# Flag kept from the original script; nothing in the visible code reads it.
exitFlag = 0

# Landing page whose links are collected by top_url().
base_url = 'https://[domain_altered_for_legal_concerns].org/zh/'

# Parallel lists filled in lockstep by top_url():
# top_urls[i] is a link and top_tit[i] is the title found on the same anchor.
top_urls, top_tit = [], []
def top_url(url):
    """Collect the links found on the page at *url*.

    Every ``<a target="_blank">`` element that has both an ``href`` and a
    ``title`` attribute is recorded: the link is appended to the module-level
    ``top_urls`` list, the title to ``top_tit`` at the same index, and a
    directory named after the title is prepared through ``create_dir``.

    Anchors with a missing or empty ``href``, or with no ``title``, are
    skipped; they would otherwise fail on string concatenation here or
    later when the title is used to build file names.

    Args:
        url: Address of the page to scan.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    # Without a timeout requests waits forever on a stalled connection.
    res = requests.get(url, timeout=30)
    html = BeautifulSoup(res.text, 'html.parser')
    for anchor in html.find_all('a', target='_blank'):
        img_u = anchor.get('href')
        img_t = anchor.get('title')
        if not img_u or img_t is None:
            continue
        top_urls.append(img_u)
        top_tit.append(img_t)
        print(img_u + '已经存入')
        create_dir(img_t)
def sec_url(url, tit):
    """Download every ``<img loading="lazy">`` image on the page at *url*.

    Each image is handed to ``down_pic`` with *tit* as the folder name and
    ``tit`` followed by the image's position on the page as the file name.
    Nothing is fetched when *url* is the one excluded address compared
    against below. Progress is reported with ``print``.

    Args:
        url: Address of the page to scan.
        tit: Title string used as folder name and file-name prefix.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    print('sec_url begin')
    if url != 'https://www.[domain_altered_for_legal_concerns].net/':
        print(url + '正在使用')
        # Without a timeout requests waits forever on a stalled connection.
        res = requests.get(url, timeout=30)
        html = BeautifulSoup(res.text, 'html.parser')
        url_list = html.find_all('img', loading='lazy')
        # enumerate() numbers tags by position. Looking the tag up with
        # list.index() returned the first *equal* tag, so identical <img>
        # elements shared one number and overwrote each other's file.
        for position, img_url in enumerate(url_list):
            print(img_url)
            img_u = img_url.get('src')
            if not img_u:
                # No usable source address on this tag; nothing to download.
                continue
            img_t = tit + str(position)
            print(img_t, img_u)
            down_pic(img_u, tit, img_t)
    print('sec_url end')
def down_pic(url, fil1, fil2):
    """Fetch *url* and save the response body as ``<fil2>.jpg`` in folder *fil1*.

    The file is written below ``./[altered_for_legal_concerns]/<fil1>/``,
    the same relative location that ``create_dir`` prepares, instead of a
    machine-specific absolute path. The folder is created on demand.

    A response with an error status is reported and skipped so that an
    error page is not stored under a ``.jpg`` name.

    Args:
        url: Address of the image.
        fil1: Name of the sub-folder that receives the file.
        fil2: File name without extension.

    Raises:
        requests.exceptions.RequestException: If the request itself fails,
            including when the server does not answer within the timeout.
        OSError: If the folder or file cannot be created.
    """
    # Without a timeout requests waits forever on a stalled connection.
    res = requests.get(url, timeout=30)
    if not res.ok:
        print('下载失败{},状态码{}'.format(fil2, res.status_code))
        return
    folder = os.path.join('.', '[altered_for_legal_concerns]', fil1)
    # exist_ok makes this safe when several threads target the same folder.
    os.makedirs(folder, exist_ok=True)
    with open(os.path.join(folder, '{}.jpg'.format(fil2)), 'wb') as f:
        f.write(res.content)
    print('已经下载{}'.format(fil2))
def create_dir(name):
    """Ensure the directory ``./[altered_for_legal_concerns]/<name>`` exists.

    Missing parent directories are created as well. Calling the function
    again for a directory that already exists is a no-op.

    Args:
        name: Directory name, interpolated into the relative path.

    Raises:
        FileExistsError: If the path exists but is not a directory.
        OSError: If the directory cannot be created.
    """
    path = "./[altered_for_legal_concerns]/{}".format(name)
    # exist_ok replaces the separate exists() check, which left a window
    # in which another thread or process could create the directory first.
    os.makedirs(path, exist_ok=True)
# Scan the landing page first; this fills top_urls / top_tit.
top_url(url=base_url)

# Start one worker thread per collected link; each runs sec_url on its page.
threads = []
for page_url, page_title in zip(top_urls, top_tit):
    print(page_url + '已经取出')
    worker = threading.Thread(target=sec_url, args=(page_url, page_title))
    threads.append(worker)
    worker.start()

# Block until every worker has finished.
for worker in threads:
    worker.join()
|