I've only been learning Python for a short while, and I pieced this together from other people's examples. The page-list fetching code had problems that kept it from crawling everything. For reference only; please don't flame.
[Python]
import os
import requests
from bs4 import BeautifulSoup

url = 'https://www.meitulu.com/'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Referer': None  # note: if fetching still fails, set this to the target site's host
}
def getHTMLTEXT(url):  # fetch a page and return its text, or None on failure
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # stop on HTTP errors instead of parsing an error page
        response.encoding = 'utf-8'
        return response.text
    except requests.RequestException:
        return None
def getURLlist(linklist):  # collect the full URL of every album linked from the home page
    html = getHTMLTEXT(url)
    if html is None:
        return linklist
    soup = BeautifulSoup(html, 'html.parser')
    for fhref in soup.select('div.boxs > ul > li > a'):
        try:
            href = fhref.attrs['href']
            linklist.append('https://www.meitulu.com/' + href)
        except KeyError:
            continue
    return linklist
def getpagelist(pageurl, pagelist):  # given one album link, collect the link of each of its pages
    pagehtml = getHTMLTEXT(pageurl)
    if pagehtml is None:
        return pagelist
    soup = BeautifulSoup(pagehtml, 'html.parser')
    for page_a in soup.select('html body center div#pages a'):
        pagelist.append('https://www.meitulu.com' + page_a.get('href'))
    pagelist.pop()  # drop the trailing link ("next page"), which duplicates a numbered page
    if pageurl not in pagelist:
        pagelist.insert(0, pageurl)  # the pager may not link back to the album's first page, so add it explicitly
    return pagelist
def downloaderpic(pagehref, picpath):  # given one page URL, download every image on it into picpath
    html = getHTMLTEXT(pagehref)
    if html is None:
        return
    soup = BeautifulSoup(html, 'html.parser')
    for spichref in soup.select('html body div.content center img'):
        pichref = spichref.get('src')
        if not pichref:
            continue
        picname = pichref.split('/')[-1]
        response = requests.get(pichref, headers=headers)
        path = picpath + picname
        print(response.url)
        with open(path, 'wb') as f:  # the with block closes the file automatically
            f.write(response.content)
def main():
    """
    urllists is the list of album links found on the home page
    pagelist is the list of page links inside one album
    """
    picpath = 'D:/pic/'
    os.makedirs(picpath, exist_ok=True)  # open() fails later if the target directory does not exist
    urllists = getURLlist([])
    howurl = len(urllists)
    flag = 0
    for urllist in urllists:
        # pass a fresh list for each album, so pages from earlier albums
        # don't accumulate and get downloaded again
        pagelist = getpagelist(urllist, [])
        flag = flag + 1
        howpage = len(pagelist)
        count = 0
        for pagehref in pagelist:  # walk the page list and download every page's images
            downloaderpic(pagehref, picpath)
            count = count + 1
            print('{0} links in total; downloading link {1} ------- link {1} has {2} pages, now on page {3}'.format(howurl, flag, howpage, count), end='\r')

main()
print("下载完成,文件保存在D:/pic/目录下") |