As promised, here is this afternoon's scraper for images from 不咳嗽. As usual I'm not posting the site URL; this is for learning purposes only.
Scraping is a bit slow, and I'm not sure whether it's my network, the site's response time, or the code itself. If any experienced folks can help optimize it, thank you!!! (One possible speed-up is sketched after the script below.)
If you find it useful, please leave a comment.
Finished build: https://www.lanzoux.com/iM5Ofgsfv4j
# UTF-8
# author mimang
import os

import requests
from bs4 import BeautifulSoup


def getHtml(url):
    """Fetch a page and return its parsed BeautifulSoup tree."""
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    html = BeautifulSoup(response.text, 'lxml')
    return html


def sub(strings, p, c):
    """Replace the character at index p of the string with c (used to swap the page number into the URL)."""
    new = list(strings)
    new[p] = str(c)
    return ''.join(new)


headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Cookie': 'UM_distinctid=174a9141007f29-0697ac1aa46086-3971095d-19fd10-174a9141008cbb; CNZZDATA1279234273=1349150808-1600561283-%7C1600561283'
}

url = 'www.xxx.com'
imgPageCount = 38

path = input('Enter the directory to save to: ')
if not os.path.exists(path):
    os.mkdir(path)
    print('Directory created')

html = getHtml(url + '/home/index.html')
allImageUrl = html.select('#menu > ul:nth-child(1) > li > a')
print(len(allImageUrl))  # number of image categories

for a in range(1, len(allImageUrl)):  # 8 category links; index 0 (home) is skipped
    os.chdir(path)
    dirName = allImageUrl[a].get_text()
    if not os.path.exists(dirName):
        os.mkdir(dirName)
    for b in range(1, imgPageCount + 1):
        string = allImageUrl[a].get('href')
        # swap the page number into position 20 of the category href
        imgHtml = getHtml(url + sub(string, 20, b))
        imgList_name = imgHtml.select('#hellobox > div.vodlist.piclist > div:nth-child(1) > a > div.vodname')
        imgList_url = imgHtml.select('#hellobox > div.vodlist.piclist > div:nth-child(1) > a')
        for c in range(0, len(imgList_name)):
            os.chdir(path + '/' + dirName)
            dirName2 = imgList_name[c].get_text()
            if not os.path.exists(dirName2):
                os.mkdir(dirName2)
            os.chdir(dirName2)
            print(imgList_url[c].get('href'))
            imgList_html = getHtml(url + imgList_url[c].get('href'))
            imgList = imgList_html.select('#hellobox > div.newsbody > div.nbodys > img')
            for d in range(0, len(imgList)):
                # headers must be passed as a keyword argument; as a positional
                # argument requests would treat it as query params
                img = requests.get(imgList[d].get('src'), headers=headers)
                # 'wb' instead of 'ab' so re-running doesn't append to existing files
                with open(str(d + 1) + '.jpg', 'wb') as f:
                    f.write(img.content)
                print('Downloaded image ' + str(d + 1))
print('Done')
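
On the speed issue: most of the time is likely spent waiting on sequential HTTP requests, one image at a time. Below is a minimal sketch (not a drop-in replacement, just an idea under assumptions) of two things that might help: reusing a single requests.Session for connection pooling, and downloading the images of one gallery concurrently with a small thread pool. The names download_gallery, fetch_one, img_urls and save_dir are illustrative, not part of the original script; it assumes the same headers dict defined above.

# Hedged sketch: one Session for keep-alive, plus threaded downloads per gallery.
# download_gallery / fetch_one / img_urls / save_dir are hypothetical names.
import os
from concurrent.futures import ThreadPoolExecutor

import requests

session = requests.Session()
session.headers.update(headers)  # reuse the headers dict from the script above


def fetch_one(img_url, save_path):
    """Download a single image to save_path, skipping files that already exist."""
    if os.path.exists(save_path):
        return
    resp = session.get(img_url, timeout=15)
    with open(save_path, 'wb') as f:
        f.write(resp.content)


def download_gallery(img_urls, save_dir, workers=8):
    """Download all images of one gallery concurrently with a small thread pool."""
    os.makedirs(save_dir, exist_ok=True)
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for d, img_url in enumerate(img_urls, start=1):
            pool.submit(fetch_one, img_url, os.path.join(save_dir, str(d) + '.jpg'))

In the innermost loop you would collect the imgList[d].get('src') values into a list and call download_gallery(srcs, path + '/' + dirName + '/' + dirName2) instead of fetching one image at a time. If the site throttles bursts of requests, lower workers or add a short delay between galleries.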