本帖最后由 acr05s 于 2020-10-10 09:53 编辑
虽然网上很多现成的,但是我想自己写一个,直接输入相册ID,然后就等着爬完就行了,全是大图,用了递归,效率感觉有点慢。想着怎么弄到网页上,还在研究。
# Standard library imports first, then third-party, per convention.
import os
import re
import time
import zipfile
from random import randint
from urllib import request

import requests
from bs4 import BeautifulSoup

# Shared request headers: a desktop Chrome UA so Douban serves the normal page.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}
def getFirstPic(album_id):
    """Fetch a Douban album page and return the ID of its first photo.

    Args:
        album_id: Numeric album ID as a string, appended to the album URL.

    Returns:
        The first photo's ID (string), or None when the page has no
        photo thumbnails (empty/private album, or layout change) — the
        caller already treats a falsy return as failure, but the original
        code crashed with AttributeError before it could return.
    """
    album_base_url = "https://www.douban.com/photos/album/"
    album_url = album_base_url + album_id
    album_html = requests.get(album_url, headers=headers)
    album_soup = BeautifulSoup(album_html.text, 'lxml')
    # Guard: .find() returns None when the div/anchor is absent.
    photo_wrap = album_soup.find('div', {'class': 'photo_wrap'})
    first_pic_url = photo_wrap.find('a') if photo_wrap else None
    if first_pic_url is None or not first_pic_url.get("href"):
        return None
    # Photo URLs end in ".../photo/<id>/", so the ID is the second-to-last segment.
    first_pic_id = first_pic_url["href"].split("/")[-2]
    print("成功找到相册第一张照片地址!\n " + first_pic_id)
    return first_pic_id
def getAllPicList(AllPicList, Fisrt_Pic_Id):
    """Walk the album's "next photo" chain and collect photo IDs in place.

    Follows each photo page's 'mainphoto' link to the next photo, appending
    each new ID to AllPicList, and stops once an ID repeats (the chain wraps
    around to the start) or the link is missing.

    The original implementation recursed once per photo, which blows
    Python's ~1000-frame recursion limit on large albums; this version
    iterates instead, with identical output.

    Args:
        AllPicList: List mutated in place with discovered photo IDs.
        Fisrt_Pic_Id: ID of the photo to start walking from.
    """
    Pic_Base_url = "https://www.douban.com/photos/photo/"
    current_id = Fisrt_Pic_Id
    while True:
        pic_html = requests.get(Pic_Base_url + current_id, headers=headers)
        pic_soup = BeautifulSoup(pic_html.text, 'lxml')
        Next_Pic_Url = pic_soup.find('a', {'class': 'mainphoto'})
        if Next_Pic_Url is None or not Next_Pic_Url.get("href"):
            # Layout changed or photo page unavailable; stop gracefully
            # instead of crashing with AttributeError.
            break
        Next_Pic_Id = Next_Pic_Url["href"].split("/")[-2]
        if Next_Pic_Id in AllPicList:
            # Chain looped back to a photo we already have: album complete.
            break
        print("成功找到照片:", Next_Pic_Id, end="\t")
        AllPicList.append(Next_Pic_Id)
        print(f"当前共读取{len(AllPicList)}个地址")
        current_id = Next_Pic_Id
    print(f"成功找到{len(AllPicList)}张照片")
def getAllPic(album_id, Pics):
    """Download every photo in Pics as a large JPEG into ./<album_id>/.

    Args:
        album_id: Album ID, used as the (pre-created) output directory name.
        Pics: Iterable of photo ID strings to download.

    Side effects:
        Writes <cwd>/<album_id>/<pic>.jpg for each photo and sleeps 1-3
        seconds between requests to avoid hammering the server.
    """
    Pic_Base_url = "https://img9.doubanio.com/view/photo/l/public/p"
    for pic in Pics:
        img = requests.get(Pic_Base_url + pic + '.webp', headers=headers)
        # os.path.join instead of hard-coded "\\" so this works on any OS.
        out_path = os.path.join(os.getcwd(), album_id, pic + ".jpg")
        with open(out_path, "wb") as f:
            f.write(img.content)
            print(f"成功下载{pic}!")
        # 'with' closes the file; the original's explicit f.close() was redundant.
        time.sleep(randint(1, 3))
# Script entry point: ask for an album ID, collect all photo IDs, download them.
if __name__ == "__main__":
    album_id = input("输入你想爬取的相册ID:")
    # Create the per-album output directory up front.
    if not os.path.exists(album_id):
        os.makedirs(album_id)
    AllPicList = []
    first_pic_id = getFirstPic(album_id)
    if first_pic_id:
        getAllPicList(AllPicList, first_pic_id)
        # Only download on the success path; the original called getAllPic
        # unconditionally (a no-op on failure since the list was empty).
        getAllPic(album_id, AllPicList)
    else:
        print("发生错误")