J站-绝对领域 图片下载
这是我刚学python写的,不知道现在还能不能用,第一次发帖,请多包涵
1.没用多线程
2.每个页面的图片分文件夹保存
3.没用代理,我自己试的时候爬了大概100多组图然后被禁IP了,等一段时间就好
[Python] 纯文本查看 复制代码
import urllib.request
from bs4 import BeautifulSoup
import os
import re
from urllib.parse import quote
# Characters Windows refuses in file names (backslash, slash, colon, asterisk,
# question mark, double quote, angle brackets, pipe) plus CR/LF; a whole run
# of them is matched at once so it collapses into one replacement.
_INVALID_NAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n]+')


def checkNameValid(name=None):
    """Return *name* with characters that are illegal in Windows file names replaced.

    Every run of one or more forbidden characters becomes a single
    underscore. The substitution is done in one regex pass, so the result
    does not depend on the order in which the runs appear (replacing the
    matched substrings one by one could leave a forbidden character behind,
    e.g. in ``"x:y:/z"``).

    :param name: the candidate file or folder name, or None
    :return: the sanitised name; None (after printing a notice) when *name*
        is None
    """
    if name is None:
        print("name is None!")
        return None
    return _INVALID_NAME_CHARS.sub("_", name)
def get_url(url, page):
    """Collect album page URLs from one page of the gallery index.

    The page number is sent as the form field ``paged``; because a data body
    is supplied, urllib issues the request as a POST. The response is parsed
    with BeautifulSoup (lxml parser) and the ``href`` of every
    ``<a class="link-block">`` element is collected.

    NOTE: the links are appended to the module-level ``url_list``, which must
    exist before this function is called; the script's main section iterates
    that list, so the side effect is part of the contract.

    :param url: URL of the gallery index page
    :param page: page number of the gallery index to fetch
    :return: list of all album URLs gathered so far (across every call),
        de-duplicated, in unspecified order
    """
    postdata = urllib.parse.urlencode({
        'paged': '%d' % page
    }).encode('utf-8')
    request = urllib.request.Request(url, postdata)
    # Parse inside the ``with`` so the connection is closed as soon as the
    # document has been read.
    with urllib.request.urlopen(request) as response:
        soup = BeautifulSoup(response, 'lxml')
    for link in soup.find_all('a', attrs={'class': 'link-block'}):
        href = link.get('href')
        # Anchors without an href would put None into the list and break
        # the later request for that "URL".
        if href:
            url_list.append(href)
    return list(set(url_list))
def get_img(url):
    """Fetch one album page and extract its image URLs and title.

    Image URLs are the ``src`` attributes of ``<img>`` elements inside
    ``<div id="entry-content">``. The title is the ``.string`` of the last
    ``<h1 class="entry-title">`` on the page.

    :param url: URL of the album page
    :return: tuple ``(images, title)`` where *images* is the list of image
        URLs, de-duplicated and in document order, and *title* is the page
        title or None when no usable title element is present
    """
    request = urllib.request.Request(url)
    # Parse inside the ``with`` so the connection is closed promptly.
    with urllib.request.urlopen(request) as response:
        soup = BeautifulSoup(response, "html.parser")

    # A local list keeps each call independent of module-level state.
    sources = []
    for container in soup.find_all('div', id='entry-content'):
        for img in container.find_all('img'):
            src = img.get('src')
            if src:
                sources.append(src)

    # Default to None so a page without a title heading does not raise
    # UnboundLocalError at the return statement.
    title = None
    for heading in soup.find_all('h1', attrs={'class': 'entry-title'}):
        title = heading.string

    # dict.fromkeys() removes duplicates while keeping document order, so
    # the numbering of the saved files is stable between runs.
    return list(dict.fromkeys(sources)), title
if __name__ == '__main__':
    # Script entry point: ask for a page range of the gallery index, collect
    # the album links found on those pages, then download every album's
    # images into a folder named after the album title.
    url = 'http://www.jdlingyu.fun/tuji/'
    skip_message = '错误!跳过---------------------------------------------------------'

    # get_url() appends the links it finds to this module-level list.
    url_list = []
    img_list = []

    star = int(input('请输入开始页码:'))
    end = int(input('请输入结束页码:')) + 1
    for page in range(star, end):
        get_url(url, page)
        print('正在获取链接,第%d页。' % page)

    # dict.fromkeys() drops duplicate links while keeping discovery order,
    # so no album is downloaded twice.
    for album_no, album_url in enumerate(dict.fromkeys(url_list), 1):
        if not album_url:
            continue
        # Reset before each call so nothing carries over between albums.
        img_list = []
        try:
            img_list, title = get_img(album_url)
        except (OSError, ValueError):
            # URLError/HTTPError are OSError subclasses; ValueError covers
            # malformed URLs. One unreachable album must not end the run.
            print(skip_message)
            continue

        name = checkNameValid(title) if title is not None else None
        # Fall back to a numbered folder when the page had no usable title.
        path = name or 'album_%d' % album_no
        print(path)
        # exist_ok lets the script be re-run without crashing on folders
        # left over from an earlier run.
        os.makedirs(path, exist_ok=True)

        x = 0
        for img in img_list:
            try:
                urllib.request.urlretrieve(img, os.path.join(path, '%d.jpg' % x))
            except (OSError, ValueError):
                print(skip_message)
            else:
                # The index only advances on success, so saved files are
                # numbered consecutively.
                x += 1
                print('正在下载第%d张图片' % x)
|