好友
阅读权限10
听众
最后登录1970-1-1
|
#目标网站: https://www.ivsky.com/
#爬取目标: 爬取原图,注意!!! 不是缩略图
#出现问题: 对存有原图链接后缀的url(data)发get请求时,使用session会话对象得到的状态码是200,但打印不出数据;手动添加cookie后才能打印出数据。不知道是我代码的原因还是网站的原因,大佬求救啊!!!
代码:
import json
import requests
from lxml import etree
import re
import os
# Scrape the original (full-size) picture info from an ivsky.com wallpaper
# album: list the picture pages of one album, read the `imgURL` value embedded
# in a picture page, then query get_picinfo.php for that picture.

# Seconds to wait for the server before a request gives up; without a timeout
# a stalled connection would block the script indefinitely.
REQUEST_TIMEOUT = 10

# Create one session so cookies are shared by every request below.
sessionp = requests.Session()

# Create the folder for saving the pictures (no error if it already exists).
os.makedirs('./极简壁纸爬取结果', exist_ok=True)

hdes_1 = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36 SE 2.X MetaSr 1.0"
}

# 1. GET the album page and collect the href of every picture page.
url = 'https://www.ivsky.com/bizhi/bubujingxin_li_v37329/'
# Go through the session (not plain requests.get) so that any cookies set by
# the album page are kept for the later requests.
html1 = sessionp.get(url=url, headers=hdes_1, timeout=REQUEST_TIMEOUT)
d = etree.HTML(html1.text)
# NOTE(review): this XPath was reconstructed from text mangled by forum
# emoticon substitution — verify it against the live page.
# Each href looks like "/bizhi/bubujingxin_li_v37329/pic_607228.html", i.e.
# https://www.ivsky.com/bizhi/bubujingxin_li_v37329/pic_607228.html
slist = d.xpath('/html/body/div[3]/div[4]/ul/li/div/a/@href')

# Pattern for the picture path stored in the page's inline script.
ec = "var imgURL='(.*?)';var"
# Pattern for the <img> src that follows the script; not used below.
ed = "</script><img.*?src='(.*?)' alt="

# 2. Request every picture page.
for li in slist:  # e.g. https://www.ivsky.com/bizhi/moraine_lake_v48781/pic_769150.html
    url2 = 'https://www.ivsky.com' + li
    print(url2)
    html2 = sessionp.get(url=url2, headers=hdes_1, timeout=REQUEST_TIMEOUT)

    img_paths = re.findall(ec, html2.text)
    if not img_paths:
        # The page did not contain `var imgURL='...'`; skip it instead of
        # failing with IndexError.
        print('imgURL not found in ' + url2)
        continue

    # URL of the data endpoint that carries the original-picture suffix.
    picurl = "https://www.ivsky.com/get_picinfo.php?tn=downloadpic&picurl=" + img_paths[0]
    print(picurl)
    hdes_2 = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36 SE 2.X MetaSr 1.0",
        "referer": url2,  # the picture page that the request originates from
    }
    print(hdes_2)
    # The query string above is equivalent to the parameters
    #   tn=downloadpic
    #   picurl=/img/bizhi/pic/201608/05/bubujingxin_li.jpg   (example value)

    # 3. Request the data endpoint and show what came back.
    data_html = sessionp.get(url=picurl, headers=hdes_2, timeout=REQUEST_TIMEOUT)
    print(data_html.text)
    break  # only the first picture is processed
|
|
发帖前要善用【论坛搜索】功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。 |
|
|
|
|