Multithreaded beauty-photo crawler
I saw a crawler thread while browsing the forum and wanted to try writing one myself for practice. If there is anything in the code that could be improved, please point it out so I can learn from your approach.
# -*- coding: utf-8 -*-
# @Time:2022/4/17 10:05
# @Author:宇
# @File:1111.py
# @Software:PyCharm
import requests
import os
import multiprocessing
from lxml import etree

headers = {
    'Referer': 'https://www.mmlme.com/jp',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36'
}

# Collect the photo-set titles and links on one listing page
def get_zu_urls(pepa):
    zu_title_urls = {}
    url = f'https://www.mmlme.com/jp/page/{pepa}'
    res = requests.get(url=url, headers=headers).text
    tree = etree.HTML(res)
    titles = tree.xpath('//div[@class="posts-row ajaxpager"]/posts/div/h2/a/text()')
    urls = tree.xpath('//div[@class="posts-row ajaxpager"]/posts/div/h2/a/@href')
    # Map each set title to its detail-page URL
    for title, url in zip(titles, urls):
        zu_title_urls[title] = url
    return zu_title_urls

# Get the image URLs inside one photo set
def get_urls(url):
    res = requests.get(url=url, headers=headers).text
    tree = etree.HTML(res)
    urls_list = tree.xpath('//ul/li/figure/a/@box-img')
    return urls_list

# Download every image of one set into its own folder
def save(name, urls_list):
    a = 1
    for i in urls_list:
        res = requests.get(url=i, headers=headers).content
        with open('图库/' + name + '/' + str(a) + '.jpg', 'wb') as f:
            f.write(res)
        a += 1
    print(name + ' finished downloading!')

def main():
    for pepa in range(1, 6):
        print(f'Downloading page {pepa}......')
        # Collect the photo-set links on this page
        zu_title_urls = get_zu_urls(pepa)
        # Create the root download folder
        if not os.path.exists('图库'):
            os.mkdir('图库')
        for name, url in zu_title_urls.items():
            if not os.path.exists('图库/' + name):
                os.mkdir('图库/' + name)
            print(name + '......')
            # Get the image URLs of this set
            urls_list = get_urls(url)
            # save(name, urls_list)  # sequential alternative
            # One worker process per photo set (despite the "multithreaded"
            # title, this version actually uses multiprocessing)
            save_process = multiprocessing.Process(target=save, args=(name, urls_list))
            save_process.start()
        print(f'Page {pepa} finished!')

if __name__ == '__main__':
    main()
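One detail the code above does not guard against: the set title goes straight into a folder name, and titles scraped from a web page can contain characters that Windows forbids in paths (\ / : * ? " < > |), which would make os.mkdir throw. A minimal sketch of a sanitizer that could be applied to the title before creating the folder; the helper name safe_name is my own, not part of the original code:

import re

def safe_name(title):
    # Replace characters that are illegal in Windows file names with '_'
    # and strip surrounding whitespace (hypothetical helper, not in the post)
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip()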
Made some modifications to the OP's code:
1. Introduced a thread pool with a configurable number of threads, 8 by default (the pool-shutdown behaviour this relies on is sketched just after this list).
2. Images already on disk are not downloaded again, so if the program stops halfway, the next run only fetches what is still missing.
3. Downloads every page of the site, stopping once the server returns 404.
4. A few smaller detail fixes.
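For reference, the pool pattern the modified version below relies on: leaving a with ThreadPoolExecutor(...) block calls shutdown(wait=True), so execution only continues once every submitted task has finished. A minimal standalone sketch of that behaviour (the task function work is just an illustration):

from concurrent.futures import ThreadPoolExecutor
import time

def work(n):
    time.sleep(0.1)
    print(f'task {n} done')

with ThreadPoolExecutor(max_workers=8) as pool:
    for n in range(20):
        pool.submit(work, n)
# Reaching this line means every submitted task has completed
print('all tasks finished')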
# -*- coding: utf-8 -*-
# @Time:2022/4/17 10:05
# @Author:宇
# @File:1111.py
# @Software:PyCharm
import requests
import os
from lxml import etree
from concurrent.futures import ThreadPoolExecutor

thread_num = 8  # configurable number of worker threads

headers = {
    'Referer': 'https://www.mmlme.com/jp',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36'
}

# Collect the photo-set titles and links on one listing page
def get_zu_urls(pepa):
    zu_title_urls = {}
    url = f'https://www.mmlme.com/jp/page/{pepa}'
    res = requests.get(url=url, headers=headers)
    # A 404 means we have run past the last page
    if res.status_code == 404:
        return 0
    tree = etree.HTML(res.text)
    titles = tree.xpath('//div[@class="posts-row ajaxpager"]/posts/div/h2/a/text()')
    urls = tree.xpath('//div[@class="posts-row ajaxpager"]/posts/div/h2/a/@href')
    for title, url in zip(titles, urls):
        zu_title_urls[title] = url
    return zu_title_urls

# Get the image URLs inside one photo set
def get_urls(url):
    res = requests.get(url=url, headers=headers).text
    tree = etree.HTML(res)
    urls_list = tree.xpath('//ul/li/figure/a/@box-img')
    return urls_list

# Download one photo set; this is the unit of work submitted to the pool
def save(name, url):
    if not os.path.exists('图库/' + name):
        os.mkdir('图库/' + name)
    # Get the image URLs of this set
    urls_list = get_urls(url)
    a = 1
    for i in urls_list:
        jpgpath = '图库/' + name + '/' + str(a) + '.jpg'
        # Skip images already downloaded by a previous run
        if os.path.exists(jpgpath):
            a += 1
            continue
        res = requests.get(url=i, headers=headers).content
        with open(jpgpath, 'wb') as f:
            f.write(res)
        a += 1
    print(name + ' finished downloading!')

def main():
    if not os.path.exists('图库'):
        os.mkdir('图库')
    with ThreadPoolExecutor(max_workers=thread_num) as pool:
        for pepa in range(1, 666):
            print(f'............downloading page {pepa}............')
            # Collect the photo-set links on this page
            zu_title_urls = get_zu_urls(pepa)
            if zu_title_urls == 0:
                break
            for name, url in zu_title_urls.items():
                # save(name, url)  # single-threaded alternative
                pool.submit(save, name, url)
    # The with-block above waits for all pool tasks before this line runs
    print('All downloads finished')

if __name__ == '__main__':
    main()
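One more detail worth flagging in both versions: requests.get is called without a timeout, and requests has no default timeout, so a single stalled connection can hang a download (or occupy a pool thread) indefinitely. A minimal, untested sketch of a guarded fetch helper that could replace the bare calls; the name fetch and the retry counts are my own choices, not from either version:

import time
import requests

def fetch(url, headers, retries=3, timeout=10):
    # Retry transient failures a few times, pausing briefly between
    # attempts; re-raise the exception if the last attempt also fails
    for attempt in range(retries):
        try:
            res = requests.get(url, headers=headers, timeout=timeout)
            res.raise_for_status()
            return res
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(1)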
I honestly can't follow the code.
Haha, great fun.
Thanks for sharing.
Learned something, thanks for sharing!
Nice work, expert.
Impressive, impressive!
In reply to 80233's modified version: learned something, thanks!
In reply to 80233's modified version: thanks to the OP and to you for sharing; it runs perfectly.