本帖最后由 baolinguo 于 2020-8-17 22:48 编辑
用楼主的代码修改一下,做一个自动采集https://www.tujigu.com/a/*/并自动下载的。
import requests, os
from lxml import etree
import urllib
from urllib.request import urlopen
import re
def geturl(url):
    """Fetch one index page and collect the album URLs listed on it.

    Parameters
    ----------
    url : str
        Index page URL, e.g. ``https://www.tujigu.com/zhongguo/1.html``.

    Returns
    -------
    list[str]
        Full album URLs of the form ``https://www.tujigu.com/a/<id>/``.

    Side effect: also stores the result in the module-level name ``list``
    (shadowing the builtin) because other code historically reads it.
    """
    global list  # NOTE(review): shadows the builtin; kept only for backward compatibility
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Mobile Safari/537.36 Edg/84.0.522.52'
    }
    req = urllib.request.Request(url, headers=headers)
    print('1、正在打开网址...' + url)
    website = urlopen(req, timeout=120)
    try:
        html = website.read().decode('utf8')
    finally:
        # Close the response even if read/decode fails.
        website.close()
    print('2、正在查找符合条件的图片网址...')
    # BUG FIX: the original pattern '<p class="biaoti"><a target="_blank">'
    # had no capture group and was a pure literal, so re.findall() returned
    # that constant tag text for every hit — the built album URLs were all
    # identical garbage.  Capture the numeric album id from the href instead.
    # assumes album anchors look like <a href="https://www.tujigu.com/a/<id>/" — TODO confirm against live HTML
    links = re.findall(
        r'<p class="biaoti"><a[^>]*href="https?://www\.tujigu\.com/a/(\d+)/"', html)
    list = []
    print('3、开始准备图片网址列表内容。。。')
    for link in links:
        list.append('https://www.tujigu.com/a/' + link + '/')
    print('列表内容准备完毕,下面开始下载图片。。。')
    return list
def downimg(imgurl):
    """Download every image of every album URL in *imgurl*.

    For each album: scrape the image URLs from page 1 and any paginated
    pages, create a directory named after the album title, and save the
    images into it as ``<index>.jpg``.

    Parameters
    ----------
    imgurl : list[str]
        Album URLs as produced by ``geturl()``.
    """
    # Hoisted out of the loop — the headers never change per album.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Mobile Safari/537.36 Edg/84.0.522.52'
    }
    # BUG FIX: iterate the parameter instead of the shadowed global ``list``,
    # and start at index 0 — the original began at h = 1 and silently
    # skipped the first album of every batch.
    for h, url in enumerate(imgurl):
        print(url)
        list1 = []
        res = requests.get(url, headers=headers, timeout=120).text
        res = etree.HTML(res)
        # Site text arrives mislabelled; re-encode/decode recovers UTF-8.
        # NOTE(review): assumes the server mis-declares the charset — confirm.
        title = res.xpath('/html/body/div[2]/div[1]/h1/text()')[0].encode('ISO-8859-1').decode('UTF-8')
        page = res.xpath('//*[@id="pages"]/a/text()')
        data = res.xpath('//div[@class="content"]/img/@src')
        print(title)
        for src in data:
            print(src)
            list1.append(src)
        # BUG FIX: single-page albums have a short (or empty) pager, and
        # page[-2] raised IndexError / int() raised ValueError on labels
        # like '下一页'.  Treat those cases as "only one page".
        try:
            last_page = int(page[-2])
        except (IndexError, ValueError):
            last_page = 1
        for i in range(2, last_page + 1):
            urls = url + '%s.html' % i
            res = requests.get(url=urls, headers=headers, timeout=120).text
            res = etree.HTML(res)
            for src in res.xpath('//div[@class="content"]/img/@src'):
                print(src)
                list1.append(src)
        path = './%s/' % title
        if not os.path.exists(path):  # 判断如果文件不存在,则创建
            os.makedirs(path)
            print("目录创建成功")
        else:
            print("目录已经存在")
        print('开始下载!!!')
        for i, jpg_url in enumerate(list1):
            # BUG FIX: added a timeout so a dead image URL cannot hang forever.
            res = requests.get(jpg_url, timeout=120).content
            with open('%s/%s.jpg' % (title, i), 'wb') as fp:
                fp.write(res)
            print('第' + str(i) + '张图片下载完成!')
        print('第' + str(h) + '个图片网址下载完成!!!')
if __name__ == '__main__':
    print('准备开始工作了。。。')
    # Crawl index pages 1..49 of the /zhongguo/ category.
    page = 1
    while page < 50:
        url = 'https://www.tujigu.com/zhongguo/' + str(page) + '.html'
        # BUG FIX: use geturl()'s return value instead of the shadowed
        # global ``list``, and advance the counter — the original never
        # incremented ``page`` and looped forever on page 1.
        albums = geturl(url)
        downimg(albums)
        page += 1