本帖最后由 话痨司机啊 于 2022-5-9 20:58 编辑
[Python] 纯文本查看 复制代码
from collections import namedtuple
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List
import re
import requests
import os
from datetime import datetime
import keyboard
from fake_useragent import UserAgent
from lxml import etree
from rich.console import Console
# Shared rich console used for all terminal output in this script.
console = Console()
# Request headers sent with every HTTP call; the User-Agent value is taken
# once, at import time, from fake_useragent's ``random`` attribute.
headers = {'User-Agent':UserAgent().random}
# Record describing one archive entry: year/month/day labels, post title
# and post URL.
DATA = namedtuple('DATA',['year','month','day','title','href'])
# Archive index page that start_requests() downloads and parses.
url = 'https://www.vmgirls.com/archives.html'
# Link extensions that save_img() accepts as images.
img_list = ['jpg','png','gif','jpeg']
def start_requests(timeout=10):
    """Yield one ``DATA`` record per post linked from the archive page.

    Downloads the module-level ``url`` with the module-level ``headers`` and
    reads the ``<div id="archives">`` element: its ``h4`` texts are used as
    year labels, the ``span`` texts below the matching ``ul`` as month labels,
    and every nested ``li`` as one entry (own text, link target, link text).

    Args:
        timeout: Seconds to wait for the archive page. requests applies no
            timeout by default, so without it a stalled connection would
            block the crawler indefinitely.

    Yields:
        DATA: ``(year, month, day, title, href)`` for each entry that has a
        day text, a link target and a link text.
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    et = etree.HTML(res.text)
    root = '//div[@id="archives"]'
    # All year headings; the n-th heading is paired with ``ul[n]`` below.
    years = et.xpath(f'{root}/h4/text()')
    for year_pos, year_label in enumerate(years, start=1):
        # Month labels of this year.
        months = et.xpath(f'{root}//ul[{year_pos}]/li/span/text()')
        for month_pos, month_label in enumerate(months, start=1):
            # Entries (one ``li`` each) listed under this month.
            day_items = f'{root}//ul[{year_pos}]/li[{month_pos}]/ul/li'
            for day_pos in range(1, len(et.xpath(day_items)) + 1):
                entry = f'{day_items}[{day_pos}]'
                day_text = et.xpath(f'{entry}/text()')
                href = et.xpath(f'{entry}/a/@href')
                title = et.xpath(f'{entry}/a/text()')
                if not (day_text and href and title):
                    # A single malformed entry must not abort the whole
                    # crawl with an IndexError; skip it instead.
                    continue
                yield DATA(year_label, month_label, day_text[0], title[0], href[0])
def get_data(yield_func):
    """Relay every item of ``yield_func`` unchanged.

    Args:
        yield_func: Iterable whose items are passed through; ``main`` hands
            in the generator returned by ``start_requests``.

    Yields:
        The items of ``yield_func``, in their original order.
    """
    yield from yield_func
def save_img(url, path, title, timeout=10):
    """Download every gallery image of one post into ``path``.

    Fetches the post page at ``url``, collects all ``href`` values found
    inside ``<div class="nc-light-gallery">`` and stores those whose
    extension is listed in ``img_list`` as ``{title}_{index}.{ext}``, where
    ``index`` is the position of the link among all gallery links.

    Args:
        url: Address of the post page.
        path: Existing directory the images are written to.
        title: Post title, used as the file-name prefix.
        timeout: Seconds to wait for each HTTP response; requests applies
            no timeout by default.

    Raises:
        KeyboardInterrupt: If the Esc key is down right after an image has
            been saved.
    """
    page = requests.get(url, headers=headers, timeout=timeout)
    et = etree.HTML(page.text)
    links = et.xpath('//div[@class="nc-light-gallery"]//@href')
    for i, link in enumerate(links):
        # Compare case-insensitively so links ending in e.g. ".JPG" count.
        ext = link.rsplit('.', 1)[-1].lower()
        if ext not in img_list:
            continue
        res = requests.get(link, headers=headers, timeout=timeout)
        if not res.ok:
            # Do not store an HTTP error body under an image file name.
            continue
        # Keep the real extension instead of labelling every file ".jpg".
        filename = f'{title}_{i}.{ext}'
        with open(os.path.join(path, filename), 'wb') as f:
            f.write(res.content)
        nowdate = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        console.print(f'[yellow]创建时间:{nowdate}\n[yellow]保存路径:{path}\n[yellow]文件:{filename} 保存成功!\n[green]提示:按esc退出')
        console.print('[blue]-'*70)
        # Non-blocking poll: keyboard.read_key() would park this worker
        # thread until some key event arrives, stalling the downloads.
        if keyboard.is_pressed('esc'):
            raise KeyboardInterrupt
def mkdir_path(path):
    """Create the directory ``path`` with all whitespace removed; return it.

    Every whitespace character is stripped from ``path`` before the
    directory, including any missing parents, is created.

    NOTE(review): the stripping applies to the whole path, so a parent
    directory whose name contains whitespace is redirected to a different
    location. Confirm this is acceptable for the callers.

    Args:
        path: Directory path, possibly containing whitespace.

    Returns:
        The whitespace-free path.

    Raises:
        FileExistsError: If the cleaned path exists but is not a directory.
    """
    path = re.sub(r'[\s]', '', path)
    # exist_ok=True replaces the exists()-then-makedirs() sequence, which
    # can fail when the directory appears between the check and the call.
    os.makedirs(path, exist_ok=True)
    return path
def _report_download(future):
    """Done-callback for one download task: abort on Esc, report failures."""
    if future.cancelled():
        return
    exc = future.exception()
    if isinstance(exc, KeyboardInterrupt):
        # The executor stores a worker's KeyboardInterrupt in the future, so
        # the exit requested via Esc has to be carried out here.
        console.print('[red]程序即将退出!')
        os._exit(0)
    if exc is not None:
        # Markup is disabled because exception text may contain brackets.
        console.print(repr(exc), style='red', markup=False)


def main():
    """Crawl the archive and download each post's images on 5 threads.

    For every record produced by ``get_data(start_requests())`` a directory
    ``<cwd>/美女壁纸/<year>/<month>/<day without its last two characters>``
    is created and ``save_img`` is submitted to a thread pool.

    The process is terminated with ``os._exit`` (skipping the wait for
    queued downloads) in three situations: a worker raised
    ``KeyboardInterrupt`` (status 0), Ctrl+C was pressed while tasks were
    being submitted (status 0), or collecting the archive entries failed
    (traceback printed, status 1).
    """
    with ThreadPoolExecutor(max_workers=5) as executor:
        try:
            for data in get_data(start_requests()):
                path = mkdir_path(os.path.join(os.getcwd(), '美女壁纸', data.year, data.month, data.day[:-2]))
                task = executor.submit(save_img, data.href, path, data.title)
                task.add_done_callback(_report_download)
        except KeyboardInterrupt:
            console.print('[red]程序即将退出!')
            os._exit(0)
        except Exception:
            # Show what went wrong instead of exiting silently with status 0.
            console.print_exception()
            console.print('[red]程序即将退出!')
            os._exit(1)
# Start the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()
也是这个网站,写个多线程的给你参考。我爬了一点,小姐姐不太高清。