acr05s 发表于 2020-10-10 09:43

自己写的一个豆瓣相册的

本帖最后由 acr05s 于 2020-10-10 09:53 编辑

虽然网上很多现成的,但是我想自己写一个,直接输入相册ID,然后就等着爬完就行了,全是大图,用了递归,效率感觉有点慢。想着怎么弄到网页上,还在研究。
import requests
from bs4 import BeautifulSoup
import time,re,os,zipfile
from random import randint
from urllib import request
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}
def getFirstPic(album_id):
    """Fetch a Douban album page and return the photo ID of its first picture.

    Args:
        album_id: the numeric album ID as a string.

    Returns:
        The first photo's ID (string), or None when the page contains no
        ``photo_wrap`` element (empty/private album, or the request was
        blocked) — callers can test the result for truthiness.
    """
    album_base_url = "https://www.douban.com/photos/album/"
    album_url = album_base_url + album_id
    album_html = requests.get(album_url, headers=headers)
    album_soup = BeautifulSoup(album_html.text, 'lxml')
    photo_wrap = album_soup.find('div', {'class': 'photo_wrap'})
    if photo_wrap is None:
        # Original code would raise AttributeError here; signal failure instead.
        return None
    first_pic_link = photo_wrap.find('a')
    # href looks like .../photos/photo/<id>/ — the ID is the next-to-last segment.
    first_pic_id = first_pic_link["href"].split("/")[-2]
    print("成功找到相册第一张照片地址!\n " + first_pic_id)
    return first_pic_id

def getAllPicList(AllPicList, Fisrt_Pic_Id):
    """Collect all photo IDs of an album by following each photo's "next" link.

    Starting from ``Fisrt_Pic_Id``, repeatedly loads the photo page, extracts
    the next photo's ID, and appends it to ``AllPicList`` (mutated in place)
    until an already-seen ID comes around, i.e. the album has wrapped.

    Note: the pasted original had a syntax error (``ifNext_Pic_Id``) and used
    unbounded recursion, which hits Python's recursion limit on large albums;
    this version walks the same chain iteratively.

    Args:
        AllPicList: list the discovered photo IDs are appended to.
        Fisrt_Pic_Id: ID of the album's first photo (starting point).
    """
    Pic_Base_url = "https://www.douban.com/photos/photo/"
    current_id = Fisrt_Pic_Id
    while True:
        pic_html = requests.get(Pic_Base_url + current_id, headers=headers)
        pic_soup = BeautifulSoup(pic_html.text, 'lxml')
        next_link = pic_soup.find('a', {'class': 'mainphoto'})
        if next_link is None:
            # Layout changed or request blocked — stop with what we have.
            print(f"成功找到{len(AllPicList)}张照片")
            return
        next_id = next_link["href"].split("/")[-2]
        if next_id in AllPicList:
            # The chain returned to a known photo: the album is complete.
            print(f"成功找到{len(AllPicList)}张照片")
            return
        print("成功找到照片:", next_id, end="\t")
        AllPicList.append(next_id)
        print(f"当前共读取{len(AllPicList)}个地址")
        current_id = next_id

def getAllPic(album_id, Pics):
    """Download every photo in ``Pics`` into the ``./<album_id>/`` directory.

    Each photo is fetched as Douban's large ("l") webp rendition and written
    as ``<pic_id>.jpg``. Sleeps 1–3 s between downloads to be polite to the
    server. Assumes the ``<album_id>`` directory already exists (the script
    entry point creates it).

    Args:
        album_id: album ID string, used as the output directory name.
        Pics: iterable of photo ID strings.
    """
    Pic_Base_url = "https://img9.doubanio.com/view/photo/l/public/p"
    for pic in Pics:
        img = requests.get(Pic_Base_url + pic + '.webp', headers=headers)
        # os.path.join is portable; the original hard-coded Windows "\\" separators.
        target = os.path.join(os.getcwd(), album_id, pic + ".jpg")
        # ``with`` closes the file — the original's explicit f.close() was redundant.
        with open(target, "wb") as f:
            f.write(img.content)
        print(f"成功下载{pic}!")
        time.sleep(randint(1, 3))
# --- Script entry point -----------------------------------------------------
album_id = input("输入你想爬取的相册ID:")
# Create the per-album output directory used by getAllPic.
if not os.path.exists(album_id):
    os.makedirs(album_id)
AllPicList = []
first_pic_id = getFirstPic(album_id)
# NOTE: the pasted original read "iffirst_pic_id:" — a missing space.
if first_pic_id:
    getAllPicList(AllPicList, first_pic_id)
else:
    print("发生错误")

getAllPic(album_id, AllPicList)

Tomatoman 发表于 2020-10-10 13:17

感谢发布原创作品,给你点赞{:1_921:}

L13 发表于 2020-10-10 15:40

相册ID不好弄啊
不过还是给楼主点赞啦
页: [1]
查看完整版本: 自己写的一个豆瓣相册的