Python爬取豆瓣博客图片
之前一直看别人发爬虫,偶然间发现一个喜欢的博主,日常拍的也是非常接地气的照片,就想下载下来看看,于是尝试模仿写了一段代码。由于没有写过,代码有点粗糙,但功能都能实现,需要的人可以看看。
# 豆瓣博客照片爬取 (Douban blog photo scraper)
import requests
import os
import random
from bs4 import BeautifulSoup
import time
import imageGUI#下载完图片自动打开滚动显示
import globalVariable#全局的一个模块,用来存储图片的地址可以忽略
# Entry page of the photo list; the trailing "start=0" is the paging offset.
index = 'https://www.douban.com/people/benbenbear/photos?start=0'
# Root directory where downloaded photos are stored.
# NOTE: a module-level `global firstDir` statement was removed — `global` is a
# no-op outside a function body.
firstDir = 'C:/Users/Administrator/Desktop/s'
# Album info ('path'/'url' keys); intended to be filled by test() and read by GUI().
classificationDict = {}
def screen(url, select):
    """Fetch *url* and return the elements matching CSS selector *select*.

    Returns a BeautifulSoup ResultSet (empty when nothing matches).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'
    }
    # A timeout keeps one stalled request from hanging the whole scraper.
    html = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(html.text, 'lxml')
    return soup.select(select)
def getPage(index):
    """Report the album's page count, ask the user for a page, and download it.

    Returns early (downloading nothing) when the page has no pager element;
    recurses with the same index when the requested page is out of range.
    """
    # 'span.thispage' is the pager element carrying the total page count.
    links = screen(index, 'span.thispage')
    if not links:
        # No pager found on the page; nothing to paginate.
        return
    # BUG FIX: .get() must be called on an element, not on the ResultSet
    # returned by screen().
    page = links[0].get('data-total-page')
    print(page + '页')
    choice = input("请输入要下载第几页")
    if choice == '1':
        init_classiffication(index)
        return
    if int(choice) <= int(page):
        # Each listing page shows 18 photos; rewrite the trailing "start"
        # offset (the URL ends with "...start=0", so replacing the final
        # character with the new offset string works).
        listIndex = list(index)
        m = (int(choice) - 1) * 18
        listIndex[-1] = str(m)
        index = ''.join(listIndex)
        init_classiffication(index)
    else:
        print("重新来")
        getPage(index)
def init_classiffication(index):
    """Download every photo linked from one listing page, then open the viewer."""
    # Each thumbnail wrapper links to the photo's detail page.
    classiffications = screen(index, 'div.photo_wrap a.photolst_photo ')
    # BUG FIX: the original called .get() on the ResultSet itself inside a
    # range() loop; .get() must be called on each individual <a> element.
    for link in classiffications:
        handleImage(link.get('href'), link.get('title'))
    # Open a scrolling viewer over everything downloaded so far.
    imageGUI.gui(globalVariable.imgUrl)
def handleImage(links, linksName):
    """Resolve the full-size image on a photo detail page and download it.

    Only sources whose fourth-from-last character is '.' (i.e. a
    three-character file extension) are handed to downLoad().
    """
    for img in screen(links, 'a.mainphoto img'):
        src = img.get('src')
        # Skip anything without a conventional 3-character extension.
        if src[-4] == '.':
            downLoad(src, linksName)
def downLoad(src, name):
    """Download one image from *src* into the current album directory.

    The target directory is the module-level ``finalDir`` set by GUI().  The
    file extension is guessed from the third-from-last URL character
    ('p' -> .png, 'j' -> .jpg, otherwise .gif).
    """
    # BUG FIX: the original only skipped a source that was exactly one space
    # (' '); also skip None, '' and other all-whitespace values.
    if not src or not src.strip():
        return
    print(src)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'
    }
    # Timeout so one dead image URL cannot hang the whole run.
    response = requests.get(src, headers=headers, timeout=10)
    # Crude politeness delay between downloads.
    time.sleep(1)
    if src[-3] == 'p':
        p = finalDir + '/' + name + '.png'
    elif src[-3] == 'j':
        p = finalDir + '/' + name + '.jpg'
    else:
        p = finalDir + '/' + name + '.gif'
    # Photo titles may contain newlines, which are illegal in file names.
    p = p.replace('\n', ' ')
    print(p)
    with open(p, 'wb') as f:
        f.write(response.content)
    # Normalise separators before handing the path to the image viewer.
    p = p.replace("\\", "/")
    globalVariable.imgUrl.append(p)
def test():
    """Scan the profile page for album links and record one in classificationDict.

    BUG FIX: the original assigned a *local* classificationDict, so the
    module-level dict that GUI() reads stayed empty and GUI() crashed with a
    KeyError.  Declaring it global makes the recorded album visible to GUI().
    """
    global classificationDict
    classification = screen(index, 'div.albumlst div.albumlst_r a')
    for s in classification:
        # One sub-directory per album, named after the album title.
        secondDir = firstDir + '/' + s.string
        href = s.get('href')
        # NOTE(review): each iteration overwrites the dict, so only the last
        # album found is kept — matching the flat 'path'/'url' keys GUI() reads.
        classificationDict = {
            'path': secondDir,
            'url': href
        }
def GUI():
    """Show the recorded album, create its directory, and start paging.

    Reads the album recorded in the module-level ``classificationDict`` and
    publishes its target directory through the module-level ``finalDir`` for
    downLoad() to use.
    """
    # BUG FIX: the original had the mangled names 'globalindex' and
    # 'globalfinalDir' (missing spaces), which raised NameError at runtime
    # instead of declaring the globals.
    global index, finalDir
    for c in classificationDict:
        print(c, end='\n')
    # NOTE(review): the answer is read but never used — the recorded album is
    # downloaded regardless of what the user types.
    choice = input("请输入要下载的")
    index = classificationDict['url']
    index += '/?m_start=0'
    path = classificationDict['path']
    if not os.path.exists(path):
        os.mkdir(path)
    finalDir = path
    getPage(index)
def main():
    """Entry point: make sure the download root exists, then run the scraper."""
    if not os.path.exists(firstDir):
        os.mkdir(firstDir)
    test()   # discover albums on the profile page
    GUI()    # let the user pick and download
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
页:
[1]