爱与萍 发表于 2020-6-29 11:46

Python爬取豆瓣博客图片

之前一直看别人发爬虫,偶然间发现一个喜欢的博主,日常拍的也是非常接地气的照片,就想下载来看看
就尝试了模仿写了一段代码,由于没有写过,代码有点粗糙,功能都能实现,需要的人可以看看
#豆瓣博客照片爬取
import requests
import os
import random
from bs4 import BeautifulSoup
import time
import imageGUI#下载完图片自动打开滚动显示
import globalVariable#全局的一个模块,用来存储图片的地址可以忽略
# Start page of the target Douban user's photo list (offset 0).
index = 'https://www.douban.com/people/benbenbear/photos?start=0'
# Base directory downloaded images are stored under.
# (The original had a module-level `global firstDir` statement, which is a
# no-op at module scope and was removed.)
firstDir = 'C:/Users/Administrator/Desktop/s'
# Holds the album info ('path'/'url') collected by test(); read by GUI().
classificationDict = {}
def screen(url, select):
    """Fetch *url* and return the tags matching CSS selector *select*.

    Returns a (possibly empty) list of bs4 Tag objects.
    """
    headers = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'
    }
    # A timeout keeps the crawler from hanging forever on a stalled request.
    response = requests.get(url, headers=headers, timeout=15)
    soup = BeautifulSoup(response.text, 'lxml')
    return soup.select(select)
def getPage(index):
    """Ask the user which photo page to download and dispatch it.

    Reads the total page count from the 'span.thispage' element of the
    list page, then rewrites the trailing ?start= offset (18 photos per
    page) before handing the URL to init_classiffication. Returns silently
    when no pager element is found.
    """
    select = 'span.thispage'
    links = screen(index, select)
    if not links:
      return
    # Bug fix: screen() returns a list of tags; the original called
    # .get() on the list itself and crashed with AttributeError.
    page = links[0].get('data-total-page')
    print(page + '页')
    choice = input("请输入要下载第几页")
    if choice == '1':
      init_classiffication(index)
      return
    # Bug fix: guard with isdigit() so non-numeric input re-prompts
    # instead of raising ValueError on int(choice).
    if choice.isdigit() and int(choice) <= int(page):
      # Each page shows 18 photos, so page n starts at offset (n-1)*18.
      m = (int(choice) - 1) * 18
      # Replace everything after the final '=' with the new offset
      # (more robust than the original single-character splice).
      index = index.rsplit('=', 1)[0] + '=' + str(m)
      init_classiffication(index)
    else:
      print("重新来")
      getPage(index)

def init_classiffication(index):
    """Download every photo linked from one photo-list page, then show a GUI.

    Each matched <a> carries the photo page href and a title used as the
    file name.
    """
    select = 'div.photo_wrap a.photolst_photo '
    classiffications = screen(index, select)
    # Bug fix: iterate the matched tags themselves; the original indexed
    # over range(len(...)) but then called .get() on the LIST, which
    # raised AttributeError.
    for tag in classiffications:
      handleImage(tag.get('href'), tag.get('title'))
    imageGUI.gui(globalVariable.imgUrl)
def handleImage(links, linksName):
    """Open one photo page and download its main image(s) under *linksName*."""
    for tag in screen(links, 'a.mainphoto img'):
      src = tag.get('src')
      # Only accept URLs with a three-letter extension ('.png'/'.jpg'/'.gif'):
      # the dot sits four characters from the end.
      if src[-4] == '.':
            downLoad(src, linksName)


def downLoad(src, name):
    """Download one image URL *src* into finalDir as *name* + extension.

    The saved path is also appended to globalVariable.imgUrl for the
    image GUI to display later.
    """
    # Bug fix: the original only skipped the literal one-space string ' ';
    # skip any empty/blank/missing URL.
    if not src or not src.strip():
      return
    print(src)
    headers = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'
    }
    response = requests.get(src, headers=headers, timeout=15)
    time.sleep(1)  # be polite to the server between downloads
    # Pick the extension from the URL: '.png' / '.jpg' / '.gif' all place
    # a distinguishing letter three characters from the end.
    if src[-3] == 'p':
      ext = '.png'
    elif src[-3] == 'j':
      ext = '.jpg'
    else:
      ext = '.gif'
    # NOTE(review): finalDir is expected to be set as a global by GUI()
    # before this runs — confirm the call order.
    p = (finalDir + '/' + name + ext).replace('\n', ' ')
    print(p)
    with open(p, 'wb') as f:
      f.write(response.content)
    p = p.replace("\\", "/")
    globalVariable.imgUrl.append(p)
def test():
    """Collect the user's album list from the main page into classificationDict.

    NOTE(review): the loop overwrites the dict on every iteration, so only
    the LAST album's 'path'/'url' survive — this mirrors the original
    intent as far as it is visible, but confirm whether all albums were
    meant to be kept.
    """
    # Bug fix: without this declaration the assignment below rebound a
    # LOCAL name, the module-level dict stayed empty, and GUI() crashed
    # with KeyError('url').
    global classificationDict
    select = 'div.albumlst div.albumlst_r a'
    classification = screen(index, select)
    for s in classification:
      secondDir = firstDir + '/' + s.string
      href = s.get('href')
      classificationDict = {
            'path': secondDir,
            'url': href
      }
def GUI():
    """Show the collected album info, create its folder and start the crawl."""
    # Bug fix: the original lines `globalindex` and `globalfinalDir` were
    # bare NameErrors (missing space in the `global` statement). finalDir
    # genuinely must be global because downLoad() reads it later; index is
    # only used locally here, so no declaration is needed for it.
    global finalDir
    for c in classificationDict:
      print(c, end='\n')
    choice = input("请输入要下载的")
    url = classificationDict['url']
    url += '/?m_start=0'
    path = classificationDict['path']
    if not os.path.exists(path):
      os.mkdir(path)
    finalDir = path
    getPage(url)
def main():
    """Entry point: make sure the base folder exists, then crawl."""
    if not os.path.exists(firstDir):
      os.mkdir(firstDir)
    test()
    GUI()


if __name__ == '__main__':
    main()


页: [1]
查看完整版本: Python爬取豆瓣博客图片