本帖最后由 森岛帆高 于 2020-2-11 21:21 编辑
图站的网址是 https://www.ivsky.com/index.php
from multiprocessing.dummy import Pool as ThreadPool
import requests
from bs4 import BeautifulSoup as bs
import os
import time
import json
import re
# Fetch one page of the index JSON feed; returns the data or False.
def getJson(index):
    """Fetch page *index* of the ivsky.com index JSON feed.

    Returns the decoded JSON payload (a list of album records), or
    False when the request fails, the body is not valid JSON, or the
    payload is empty (no more pages).
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
    url = 'https://www.ivsky.com/index.php?tn=indexload&page=%s&h=1581405277021' % index
    try:
        # Timeout so a stalled connection cannot hang a worker thread forever;
        # treat network failures the same as a bad JSON body.
        html = requests.get(url=url, headers=headers, timeout=10)
        data = json.loads(html.text, strict=False)
    except (requests.RequestException, json.JSONDecodeError):
        return False
    print('正在爬取第' + str(index) + '个JSON返回的数据....')
    # An empty payload means this page has no albums.
    if data:
        return data
    return False
# Build the absolute album-page URLs from the JSON feed records.
def getUrls(data):
    """Return absolute ivsky.com album URLs for every record in *data*."""
    # Each feed record carries a site-relative 'arcurl'; prefix the host.
    return ['https://www.ivsky.com' + entry['arcurl'] for entry in data]
# Fetch *url* and parse the response body into a BeautifulSoup tree.
def sp(url):
    """Return a BeautifulSoup parse tree for the page at *url*.

    Uses the stdlib 'html.parser' backend, so no extra parser
    dependency is needed.
    """
    # Timeout so a dead connection cannot block a worker thread forever.
    html = requests.get(url, timeout=10)
    soup = bs(html.content, 'html.parser')
    return soup
# Collect the thumbnail image URLs from a parsed album page.
def imgList(soup):
    """Return scheme-qualified image URLs found in *soup*.

    The page serves protocol-relative ``src`` values, so each one is
    prefixed with 'https:'.
    """
    # Thumbnails are the <img> tags inside the album's .il_img anchors.
    return ['https:' + tag['src'] for tag in soup.select("div .il_img>a>img")]
# Download every album listed on one index page.
def downImg(index):
    """Download all image sets from index page *index* into ``file/<title>/``.

    Each album gets its own directory named after the page title; image
    URLs are rewritten from thumbnail ('/t/') to full-size ('/pre/')
    form before downloading. Returns early when the page's JSON feed
    could not be fetched, so the worker thread terminates cleanly.
    """
    data = getJson(index)
    # A failed fetch (or empty page) ends this worker's job.
    if not data:
        return
    for url in getUrls(data):
        soup = sp(url)
        imgBox = imgList(soup)
        # Page title minus its trailing character names the album folder.
        title = soup.find('h1').text[0:-1]
        # exist_ok avoids the check-then-create race between worker threads.
        os.makedirs('file/%s' % title, exist_ok=True)
        for img in imgBox:
            # Swap the thumbnail path segment for the full-resolution one.
            img = img.replace('/t/', '/pre/')
            tu = requests.get(img, timeout=10)
            # Last path segment is the image file name; rsplit is safe for
            # URLs with any number of segments (split('/', 8)[8] was not).
            name = img.rsplit('/', 1)[-1]
            with open('file/%s/%s' % (title, name), mode='wb') as obj:
                obj.write(tu.content)
            # Be polite to the server between downloads.
            time.sleep(3)
if __name__ == '__main__':
    # Fan the first 3500 index pages out across a small thread pool.
    worker_pool = ThreadPool(8)
    worker_pool.map(downImg, range(3500))
    # Stop accepting work, then wait for every download thread to finish.
    worker_pool.close()
    worker_pool.join()
爬取的图片如下
|