Today I've been practicing scraping images from https://www.ivsky.com/bizhi/nvxing_1920x1080/. The code sometimes manages to grab four or five images, but other times it errors out after just one, and I don't know whether the site has some anti-crawling measure; I'd appreciate some pointers. Specifically, addlist3 sometimes comes back with no data, which throws "list index out of range". Also, every time a file is saved the script reports that the save failed, yet the image is still written to disk. I can't figure out where the problem is.
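The "list index out of range" most likely comes from addlist3 being empty: xpath() returns [] when the page has no img with id="imgis" (for example when the site serves a placeholder instead of the real picture page), so addlist3[0] blows up. Below is a minimal sketch of a guard plus a simple retry; the helper name get_img_src and the retry count are my own assumptions, not part of the original script.

import time
import requests
from lxml import etree

def get_img_src(page_url, headers, retries=3):
    # Hypothetical helper: fetch a single-image page and return the full-size
    # image URL, or None if img#imgis never shows up after a few attempts.
    for attempt in range(retries):
        resp = requests.get(page_url, headers=headers)
        html = etree.HTML(resp.content.decode('utf-8', errors='ignore'))
        srcs = html.xpath('//img[@id="imgis"]/@src')
        if srcs:                 # xpath returns an empty list when the node is missing
            return "http:" + srcs[0]
        time.sleep(2)            # back off a little before retrying
    return None

Calling something like this where the script currently does addlist3[0], and skipping the picture when it returns None, avoids the crash; it does not bypass whatever anti-crawling the site may do, it just keeps one bad response from killing the whole run.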
import requests, random, os
import time  # time.sleep() is used when saving; without this import every save raised NameError
from lxml import etree
#url='https://www.ivsky.com/bizhi/nvxing_1920x1080/'
#url = 'https://www.meitulu.com/rihan/'
def ranheader():
    user1 = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60'
    user2 = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0'
    user3 = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2'
    user4 = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
    user5 = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
    user6 = 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
    user7 = 'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'
    user8 = 'Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'
    list1 = [user1, user2, user3, user4, user5, user6, user7, user8]
    agent = random.choice(list1)
    header = {'User-Agent': agent, 'Referer': 'https://www.ivsky.com/bizhi/oumei_meinv_v58557/'}
    return header
def fanwen(url):
    # Fetch a page with a freshly randomized User-Agent.
    header = ranheader()
    persone = requests.get(url, headers=header)
    return persone
def xeixi(persone):
    # Parse the list page: each li holds an album link and its title.
    html1 = etree.HTML(persone.content.decode())
    addlist1 = html1.xpath('//ul[@class="ali"]/li/div[@class="il_img"]/a/@href')
    filename1 = html1.xpath('//ul[@class="ali"]/li/div[@class="il_img"]/a/@title')
    # range(len(addlist1)), not len(addlist1)-1, otherwise the last album is skipped.
    for j in range(len(addlist1)):
        file_path = "f:/明星/" + filename1[j]
        if not os.path.exists(file_path):
            os.makedirs(file_path)
            print(file_path + ' created')
        os.chdir(file_path)
        # Open the album page and collect the links to the single-image pages.
        url1_new = url1 + str(addlist1[j])  # url1 is the site root defined in the __main__ block
        html2 = fanwen(url1_new)
        html3 = etree.HTML(html2.content.decode())
        addlist2 = html3.xpath('//ul[@class="pli"]/li/div[@class="il_img"]/a/@href')
        print(addlist2)
        for k in range(len(addlist2)):
            url2_new = url1 + addlist2[k]
            persone1 = fanwen(url2_new)
            html3 = etree.HTML(persone1.content.decode())
            addlist3 = html3.xpath('//div/div/div/img[@id="imgis"]/@src')
            print(addlist3)
            if not addlist3:
                # No img#imgis on this page (this is the "list index out of range" case); skip it.
                print('No image found on ' + url2_new + ', skipping')
                continue
            filename = str(k + 1) + ".jpg"
            print('Saving image ' + str(k + 1))
            addr = "http:" + str(addlist3[0])
            filename2 = fanwen(addr)
            try:
                with open(filename, 'wb') as f:
                    f.write(filename2.content)
                # No f.close() needed: the with-block closes the file.
                time.sleep(1)
            except OSError as e:
                # Catch only file errors and show them. The old bare except also swallowed
                # the NameError from the missing "import time", which is why every save
                # printed a failure message even though the picture had been written.
                print("File save error:", e)
if __name__ == '__main__':
    url = 'https://www.ivsky.com/bizhi/nvxing_1920x1080/'
    url1 = 'https://www.ivsky.com'
    kk = fanwen(url)
    xeixi(kk)