爱与萍 发表于 2020-6-29 11:46

Python爬取豆瓣博客图片

之前一直看别人发爬虫,偶然间发现一个喜欢的博主,日常拍的也是非常接地气的照片,就想下载来看看
就尝试了模仿写了一段代码,由于没有写过,代码有点粗糙,功能都能实现,需要的人可以看看
#豆瓣博客照片爬取
import requests
import os
import random
from bs4 import BeautifulSoup
import time
import imageGUI#下载完图片自动打开滚动显示
import globalVariable#全局的一个模块,用来存储图片的地址可以忽略
# Start page of the target Douban user's photo list (offset 0).
index = 'https://www.douban.com/people/benbenbear/photos?start=0'
# Base directory downloaded images are stored under.
# (The original had a module-level `global firstDir` statement, which is a
# no-op at module scope and was removed.)
firstDir = 'C:/Users/Administrator/Desktop/s'
# Holds the album info ('path'/'url') collected by test(); read by GUI().
classificationDict = {}
def screen(url, select):
    """Fetch *url* and return the tags matching CSS selector *select*.

    Returns a (possibly empty) list of bs4 Tag objects.
    """
    headers = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'
    }
    # A timeout keeps the crawler from hanging forever on a stalled request.
    response = requests.get(url, headers=headers, timeout=15)
    soup = BeautifulSoup(response.text, 'lxml')
    return soup.select(select)
def getPage(index):
    """Ask the user which photo page to download and dispatch it.

    Reads the total page count from the 'span.thispage' element of the
    list page, then rewrites the trailing ?start= offset (18 photos per
    page) before handing the URL to init_classiffication. Returns silently
    when no pager element is found.
    """
    select = 'span.thispage'
    links = screen(index, select)
    if not links:
      return
    # Bug fix: screen() returns a list of tags; the original called
    # .get() on the list itself and crashed with AttributeError.
    page = links[0].get('data-total-page')
    print(page + '页')
    choice = input("请输入要下载第几页")
    if choice == '1':
      init_classiffication(index)
      return
    # Bug fix: guard with isdigit() so non-numeric input re-prompts
    # instead of raising ValueError on int(choice).
    if choice.isdigit() and int(choice) <= int(page):
      # Each page shows 18 photos, so page n starts at offset (n-1)*18.
      m = (int(choice) - 1) * 18
      # Replace everything after the final '=' with the new offset
      # (more robust than the original single-character splice).
      index = index.rsplit('=', 1)[0] + '=' + str(m)
      init_classiffication(index)
    else:
      print("重新来")
      getPage(index)

def init_classiffication(index):
    """Download every photo linked from one photo-list page, then show a GUI.

    Each matched <a> carries the photo page href and a title used as the
    file name.
    """
    select = 'div.photo_wrap a.photolst_photo '
    classiffications = screen(index, select)
    # Bug fix: iterate the matched tags themselves; the original indexed
    # over range(len(...)) but then called .get() on the LIST, which
    # raised AttributeError.
    for tag in classiffications:
      handleImage(tag.get('href'), tag.get('title'))
    imageGUI.gui(globalVariable.imgUrl)
def handleImage(links, linksName):
    """Open one photo page and download its main image(s) under *linksName*."""
    for tag in screen(links, 'a.mainphoto img'):
      src = tag.get('src')
      # Only accept URLs with a three-letter extension ('.png'/'.jpg'/'.gif'):
      # the dot sits four characters from the end.
      if src[-4] == '.':
            downLoad(src, linksName)


def downLoad(src, name):
    """Download one image URL *src* into finalDir as *name* + extension.

    The saved path is also appended to globalVariable.imgUrl for the
    image GUI to display later.
    """
    # Bug fix: the original only skipped the literal one-space string ' ';
    # skip any empty/blank/missing URL.
    if not src or not src.strip():
      return
    print(src)
    headers = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'
    }
    response = requests.get(src, headers=headers, timeout=15)
    time.sleep(1)  # be polite to the server between downloads
    # Pick the extension from the URL: '.png' / '.jpg' / '.gif' all place
    # a distinguishing letter three characters from the end.
    if src[-3] == 'p':
      ext = '.png'
    elif src[-3] == 'j':
      ext = '.jpg'
    else:
      ext = '.gif'
    # NOTE(review): finalDir is expected to be set as a global by GUI()
    # before this runs — confirm the call order.
    p = (finalDir + '/' + name + ext).replace('\n', ' ')
    print(p)
    with open(p, 'wb') as f:
      f.write(response.content)
    p = p.replace("\\", "/")
    globalVariable.imgUrl.append(p)
def test():
    """Collect the user's album list from the main page into classificationDict.

    NOTE(review): the loop overwrites the dict on every iteration, so only
    the LAST album's 'path'/'url' survive — this mirrors the original
    intent as far as it is visible, but confirm whether all albums were
    meant to be kept.
    """
    # Bug fix: without this declaration the assignment below rebound a
    # LOCAL name, the module-level dict stayed empty, and GUI() crashed
    # with KeyError('url').
    global classificationDict
    select = 'div.albumlst div.albumlst_r a'
    classification = screen(index, select)
    for s in classification:
      secondDir = firstDir + '/' + s.string
      href = s.get('href')
      classificationDict = {
            'path': secondDir,
            'url': href
      }
def GUI():
    """Show the collected album info, create its folder and start the crawl."""
    # Bug fix: the original lines `globalindex` and `globalfinalDir` were
    # bare NameErrors (missing space in the `global` statement). finalDir
    # genuinely must be global because downLoad() reads it later; index is
    # only used locally here, so no declaration is needed for it.
    global finalDir
    for c in classificationDict:
      print(c, end='\n')
    choice = input("请输入要下载的")
    url = classificationDict['url']
    url += '/?m_start=0'
    path = classificationDict['path']
    if not os.path.exists(path):
      os.mkdir(path)
    finalDir = path
    getPage(url)
def main():
    """Entry point: make sure the base folder exists, then crawl."""
    if not os.path.exists(firstDir):
      os.mkdir(firstDir)
    test()
    GUI()


if __name__ == '__main__':
    main()


页: [1]
查看完整版本: Python爬取豆瓣博客图片