本帖最后由 17788210295 于 2019-8-22 14:18 编辑
利用 pool.map 进程池高效爬取大图片。“大图片”的意思就是最下面那张图。记得给个评分、评论哟
编码格式为 gb2312
不说了上代码:
[Python] 纯文本查看 复制代码
# coding=gb2312
import requests
from lxml import etree
from multiprocessing import Pool
import os
from time import sleep
import random
class Down_pic():
    """Scrape wallpaper images from desk.zol.com.cn.

    Walks top-level categories -> sub-categories -> image pages, collects
    direct image URLs, downloads them in parallel with a process pool,
    and writes them to disk under ``<category>/<sub-category>/``.

    NOTE(review): the pairing of downloaded bytes to sub-categories in
    ``ppp`` relies on ``self.dic`` insertion order matching the order the
    generators produced URLs — preserved as-is, verify before refactoring.
    """

    def __init__(self):
        # Browser-like headers so the site serves normal pages.
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "desk.zol.com.cn",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
        }
        self.main_url = 'http://desk.zol.com.cn'  # site root
        self.dic = {}             # {category name: [sub-category names]}
        self.count = 0            # total number of images collected
        self.k = 0                # slice cursor into the downloaded-data list
        self.type = 5             # number of top-level categories to take
        self.small_type_num = 4   # sub-categories per category
        self.tupian = 3           # images per sub-category (max 3)

    def get_tree(self, htlm):
        """Parse an HTML string into an lxml element tree.

        The parameter name ``htlm`` is a historical typo, kept so that
        keyword-argument callers keep working.
        """
        tree = etree.HTML(htlm)
        return tree

    def get_type(self):
        """Yield ``(name, url)`` for the first ``self.type`` categories."""
        main_page = requests.get(self.main_url, headers=self.headers).text
        tree = self.get_tree(main_page)
        a_list = tree.xpath('//*[@id="main"]/dl[1]/dd/a')
        # Drop the leading "all categories" pseudo-entry.
        a_list.pop(0)
        for a in a_list[0:self.type]:
            type_name = a.xpath('./text()')[0]
            type_url = self.main_url + a.xpath('./@href')[0]
            yield type_name, type_url

    def get_small_type(self):
        """Yield lists of sub-category page URLs; record names in self.dic."""
        for type_name, type_url in self.get_type():
            small_page = requests.get(type_url)
            # The site serves gb2312-encoded pages.
            small_page.encoding = 'gb2312'
            tree = self.get_tree(small_page.text)
            small_name_list = tree.xpath('//a[@class="pic"]/span/@title')[0:self.small_type_num]
            small_url_list = tree.xpath('//a[@class="pic"]/@href')[0:self.small_type_num]
            self.dic[type_name] = small_name_list
            yield small_url_list

    def get_pic_list(self):
        """Yield image-detail page URLs, ``self.tupian`` per sub-category."""
        for pic_page_url in self.get_small_type():
            for pic_url in pic_page_url:
                url = self.main_url + pic_url
                pic_page = requests.get(url=url).text
                # Fix: do not shadow the imported `etree` module here.
                tree = self.get_tree(pic_page)
                pic_list_url = tree.xpath('//*[@id="showImg"]/li/a/@href')
                for pic_url in pic_list_url[:self.tupian]:
                    yield self.main_url + pic_url

    def get_size(self):
        """Yield the URL of the size-selection page (or the raw image URL)."""
        for pic_url in self.get_pic_list():
            pic_page = requests.get(pic_url).text
            tree = self.get_tree(pic_page)
            try:
                # Second size link is usually the 2880x1800 variant.
                data_url = self.main_url + tree.xpath('//*[@id="tagfbl"]/a[2]/@href')[0]
            except Exception:
                # No size list on the page — fall back to the displayed image.
                data_url = tree.xpath('//*[@id="bigImg"]/@src')[0]
            yield data_url

    def get_data(self):
        """Yield the final downloadable image URL for every collected page."""
        for url in self.get_size():
            data_page = requests.get(url).text
            tree = self.get_tree(data_page)
            try:
                pic_data_url = tree.xpath('/html/body/img[1]/@src')[0]
            except Exception:
                # `url` already pointed straight at the image.
                pic_data_url = url
            self.count += 1
            yield pic_data_url
        # Only reached after the generator is exhausted.
        self.num = self.count

    def ppp(self):
        """Download all images with a process pool and write them to disk."""
        print('开启线程')
        pool = Pool(5)  # process pool (multiprocessing), despite the message text
        datas = pool.map(self.download, [url for url in self.get_data()])
        pool.close()
        pool.join()
        for type_name in self.dic:
            for small_name in self.dic[type_name]:
                path = type_name + '/' + small_name
                # Strip any query-string remnant from the title.
                # Fix: the original `path[:path.find('?')]` chopped off the
                # last character whenever no '?' was present (find() == -1).
                q = path.find('?')
                if q != -1:
                    path = path[:q]
                if not os.path.exists(path):
                    os.makedirs(path)
                for data in datas[self.k:self.k + self.tupian]:
                    # Random suffix avoids collisions between images.
                    name = small_name + str(random.randint(1, 1000))
                    pa = path + '/' + name + '.jpg'
                    with open(pa, 'wb') as f:
                        f.write(data)
                self.k += self.tupian
        print('共下载:{}图片'.format(self.count))

    def download(self, url):
        """Fetch one image and return its raw bytes (runs in a pool worker)."""
        data = requests.get(url=url).content
        sleep(1)  # be polite to the server
        return data
if __name__ == '__main__':
    # Entry point: build the scraper and run the full download pipeline.
    Down_pic().ppp()
注:此文章所有内容仅供学习,不允许商用,如有侵权,请联系删除,谢谢。