Newbie scraping girl pics, big shots please steer clear
First off, I don't even count as a newbie, because I've never written Python; I do front-end work.
Second, I found that this thing's syntax is a lot like JS.
I also borrowed from this guy's post and fixed a few issues: https://www.52pojie.cn/thread-1168386-1-1.html
Why the fixes? I left my PC crawling images overnight and ran over excitedly the next morning to check: it had crawled a dozen or so folders, then hit a special character, threw an error, and stopped. So I tweaked it a bit, hehe!
Version 1: May 8, 2020
1. One image every three seconds, so the server doesn't ban your IP; my work PC had no delay set and its IP apparently got banned;
2. Crawl as many pages as you like; it's configurable and more convenient;
3. Special characters are stripped automatically, since Windows folder names can't contain certain ones (a small standalone sketch follows this list).
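Point 3 boils down to a single re.sub call over the album title, using a character class of everything Windows forbids in file and folder names. A minimal standalone sketch (note the backslash has to be escaped too, which the pattern below includes):

import re

# Characters Windows forbids in file and folder names
regEx = r'[\\/:*?"<>|]'

def safe_name(name):
    # Strip every forbidden character so os.mkdir(name) cannot choke on them
    return re.sub(regEx, '', name)

print(safe_name('2020-5-8 photo: set<1>?'))  # -> '2020-5-8 photo set1'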
import requests
from lxml import etree
import os
import time
import re

# Folder to store the images in
# folder = "images"
# if not os.path.exists(folder):
#     os.mkdir(folder)

# Start page
play_page = 1
# End page (0 or less means crawl through the site's last page)
and_page = 0
# Pause between images, in seconds
time_sleep = 3
# Site to crawl
post_url = "https://www.mzitu.com/"
# Characters Windows forbids in folder and file names
regEx = r'[\\/:*?"<>|]'
# Request headers
header = {
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36",
    "referer": "https://www.mzitu.com/",
    "Connection": "close"
}
# Fetch the front page as text
post_url_text = requests.get(post_url, headers=header).text
# Parse it into an element tree that xpath can query
html = etree.HTML(post_url_text)
if and_page <= 0:
    # The second-to-last nav link holds the total page count
    page_number = int(html.xpath('//*[@class="nav-links"]/a/text()')[-2])
else:
    page_number = and_page
while play_page <= page_number:
    # URL of the next list page to crawl
    new_post_url = 'https://www.mzitu.com/page' + '/' + str(play_page)
    # Fetch the list page as text
    new_post_url_text = requests.get(new_post_url, headers=header).text
    # Parse it for xpath
    new_html = etree.HTML(new_post_url_text)
    # Album URLs on this list page
    new_list_page_url = new_html.xpath('//*[@id="pins"]/li/a/@href')
    for image_url in new_list_page_url:
        # Fetch the album page as text
        image_url_text = requests.get(image_url, headers=header).text
        # Parse it for xpath
        image_html = etree.HTML(image_url_text)
        # How many pages of images this album has
        max_image_page = image_html.xpath('//*[@class="pagenavi"]/a/span/text()')
        # Album name, with forbidden characters stripped
        image_name = re.sub(regEx, '', (''.join(image_html.xpath('//*[@class="main-image"]/p/a/img/@alt'))))
        # Create a folder named after the album to hold its images
        if not os.path.exists(image_name):
            os.mkdir(image_name)
        image_i = 1  # index of the first image
        while image_i <= int(max_image_page[-2]):
            # Pause after every image to give the site a rest
            time.sleep(time_sleep)
            # Page that holds the actual image
            real_image_url = image_url + '/' + str(image_i)
            # Fetch and parse it
            real_image_url_r = requests.get(real_image_url, headers=header).text
            real_image_url_html = etree.HTML(real_image_url_r)
            '''Image download starts here'''
            # Image title (alt text), used together with the index for naming
            image_alt = real_image_url_html.xpath('//*[@class="main-title"]/text()')
            # The image's src address
            image_src = real_image_url_html.xpath('//*[@class="main-image"]/p/a/img/@src')
            # Loop over the image names
            for pic_name in image_alt:
                # Loop over the download addresses
                for pic_url in image_src:
                    with open(image_name + '/' + (re.sub(regEx, '', pic_name)) + '.jpg', 'wb') as pic:
                        pic.write(requests.get(pic_url, headers=header).content)
                    # Show the image name
                    print(pic_name)
            image_i = image_i + 1
    # Move on to the next list page
    play_page = play_page + 1
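One thing to watch: if the connection drops mid-run (see the MaxRetryError in the replies below), every requests.get above dies unhandled, which is exactly how an overnight run stops. A minimal sketch of a fetch helper with a timeout and a few retries that the calls above could be routed through; the retry count and pause are just illustrative values:

import time
import requests

def fetch(url, header, retries=3, timeout=10):
    # Retry a flaky GET a few times before giving up, instead of
    # letting one dropped connection kill the whole run.
    for attempt in range(retries):
        try:
            return requests.get(url, headers=header, timeout=timeout)
        except requests.exceptions.RequestException as err:
            print("Request failed (attempt " + str(attempt + 1) + "): " + str(err))
            time.sleep(5)  # brief pause before retrying
    return None

# Usage: swap requests.get(...) calls for fetch(...), e.g.
# resp = fetch(post_url, header)
# post_url_text = resp.text if resp is not None else ''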
Version 2: May 9, 2020
1. Choose the crawl type;
2. Added search;
3. Custom start page;
4. Custom end page;
import requests
from lxml import etree
import os
import time
import re

play_page = 0
and_page = 0
time_sleep = 3
regEx = r'[\\/:*?"<>|]'


def set_input(url, default_url, default_type):
    user_url = int(float(input("Which site do you want to crawl? (" + url + "): ")))
    if (user_url > 0) and (user_url <= len(default_url)):
        print("You chose to crawl: " + default_url[user_url - 1])
        # Call post_1 / post_2 depending on the number entered
        eval('post_' + str(user_url))(default_url[user_url - 1], default_type)
    else:
        print("Invalid input, please try again!")
        set_input(url, default_url, default_type)


def post_1(default_url, default_type):
    type = ""
    header = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36",
        "referer": default_url,
        "Connection": "close"
    }
    default_type_number = 1
    for index in range(len(default_type)):
        type = type + str(index + 1) + "." + default_type[index] + ";"
    grab_type = int(float(input("Which crawl type? (" + type + "): ")))
    if grab_type == 1 or grab_type == 2:
        print("Crawl type: " + default_type[grab_type - 1])
        default_type_number = grab_type
    else:
        print("Invalid input, defaulting to sequential crawling!")
    if default_type_number == 1:
        default_url_html = etree.HTML(requests.get(default_url, headers=header).text)
        page_number = int(default_url_html.xpath('//*[@class="nav-links"]/a/text()')[-2])
        user_play_page = int(float(input("Start page: ")))
        if user_play_page <= page_number and user_play_page > 0:
            play_page = user_play_page
        else:
            print("Invalid page number, starting from page 1!")
            play_page = 1
        user_and_page = int(float(input("End page: ")))
        if (play_page <= user_and_page) and (user_and_page <= page_number):
            and_page = user_and_page
        else:
            print("Invalid page number, ending at the last page!")
            and_page = page_number
        print("Starting the crawl, please wait...")
        while play_page <= and_page:
            post_url = default_url + 'page/' + str(play_page)
            post_url_html = etree.HTML(requests.get(post_url, headers=header).text)
            post_url_html_page = post_url_html.xpath('//*[@id="pins"]/li/a/@href')
            post_1_down(post_url_html_page, header)
            play_page = play_page + 1
    else:
        user_search = input("What do you want to search for? ")
        print("Starting the crawl, please wait...")
        search_post_url = default_url + 'search/' + user_search + '/'
        search_post_url_html = etree.HTML(requests.get(search_post_url, headers=header).text)
        search_post_url_html_page = int(search_post_url_html.xpath('//*[@class="pagination"]/div/a/text()')[-2])
        if search_post_url_html_page >= 1:
            search_image_page = 1
            while search_image_page <= search_post_url_html_page:
                search_image_url = search_post_url + 'page/' + str(search_image_page)
                search_image_url_html = etree.HTML(requests.get(search_image_url, headers=header).text)
                search_image_url_html_page = search_image_url_html.xpath('//*[@id="pins"]/li/a/@href')
                post_1_down(search_image_url_html_page, header)
                search_image_page = search_image_page + 1
        else:
            print("No search results!")


def post_1_down(url_page, header):
    for image_url in url_page:
        image_html = etree.HTML(requests.get(image_url, headers=header).text)
        max_image_page = image_html.xpath('//*[@class="pagenavi"]/a/span/text()')
        image_name = re.sub(regEx, '', (''.join(image_html.xpath('//*[@class="main-image"]/p/a/img/@alt'))))
        if not os.path.exists(image_name):
            os.mkdir(image_name)
        image_i = 1
        while image_i <= int(max_image_page[-2]):
            time.sleep(time_sleep)
            real_image_url = image_url + '/' + str(image_i)
            real_image_url_r = requests.get(real_image_url, headers=header).text
            real_image_url_html = etree.HTML(real_image_url_r)
            image_alt = real_image_url_html.xpath('//*[@class="main-title"]/text()')
            image_src = real_image_url_html.xpath('//*[@class="main-image"]/p/a/img/@src')
            for pic_name in image_alt:
                for pic_url in image_src:
                    with open(image_name + '/' + (re.sub(regEx, '', pic_name)) + '.jpg', 'wb') as pic:
                        pic.write(requests.get(pic_url, headers=header).content)
                    print("Downloading: " + pic_name)
            image_i = image_i + 1


def post_2(default_url, default_type):
    print("This is the second site: " + default_url, default_type)


url = ""
default_url = ['https://www.mzitu.com/', 'under development']
default_type = ['sequential crawl', 'search crawl']
for index in range(len(default_url)):
    url = url + str(index + 1) + "." + default_url[index] + ";"
set_input(url, default_url, default_type)
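A side note on the `eval('post_' + str(user_url))` line: it works, but a plain dict of functions does the same dispatch without eval. A sketch of the alternative, reusing the same function names as above:

# Map menu numbers to handler functions instead of building names with eval
handlers = {1: post_1, 2: post_2}

def set_input(url, default_url, default_type):
    user_url = int(float(input("Which site do you want to crawl? (" + url + "): ")))
    if user_url in handlers:
        print("You chose to crawl: " + default_url[user_url - 1])
        handlers[user_url](default_url[user_url - 1], default_type)
    else:
        print("Invalid input, please try again!")
        set_input(url, default_url, default_type)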
I'm at work all day with nothing to do, so I'm studying Python! If I find time later I'll add search and other such features.

Reply: Bro, it only crawls the starting page, then it errors out:
Traceback (most recent call last):
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
conn = connection.create_connection(
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\util\connection.py", line 84, in create_connection
raise err
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\util\connection.py", line 74, in create_connection
sock.connect(sa)
TimeoutError: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
httplib_response = self._make_request(
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
self._validate_conn(conn)
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connectionpool.py", line 976, in _validate_conn
conn.connect()
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connection.py", line 308, in connect
conn = self._new_conn()
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connection.py", line 171, in _new_conn
raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPSConnection object at 0x000001B445964B20>: Failed to establish a new connection: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\requests\adapters.py", line 439, in send
resp = conn.urlopen(
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connectionpool.py", line 724, in urlopen
retries = retries.increment(
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\util\retry.py", line 439, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='www.mzitu.com', port=443): Max retries exceeded with url: /228185/5 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001B445964B20>: Failed to establish a new connection: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.'))

Another reply: the search crawl prints "Starting the crawl, please wait..." and then throws:
Traceback (most recent call last):
File "C:\Users\Administrator\Desktop\py.py", line 185, in <module>
set_input(url, default_url, default_type)
File "C:\Users\Administrator\Desktop\py.py", line 24, in set_input
eval('post_' + str(user_url))(default_url, default_type)
File "C:\Users\Administrator\Desktop\py.py", line 112, in post_1
search_post_url_html_page = int(search_post_url_html.xpath('//*[@class="pagination"]/div/a/text()')[-2])
IndexError: list index out of range
Searching and then downloading shows this. What's the problem??
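A likely cause: when the search term matches nothing, or the results fit on a single page, the page has no `.pagination` element, so the xpath returns an empty list and indexing it with `[-2]` raises IndexError. A minimal guard, assuming that's what the page looks like:

page_links = search_post_url_html.xpath('//*[@class="pagination"]/div/a/text()')
if len(page_links) >= 2:
    search_post_url_html_page = int(page_links[-2])
else:
    # No pagination element: treat it as a single page of results (or none)
    search_post_url_html_page = 1

With that in place, a one-page search result is crawled as page 1 instead of crashing.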
hao290558809 (2020-5-8 22:03): Impressive, bro, that was one smooth, flowing run.

Author: Honestly, I'm a newbie too! I've only just started looking at Python, and it doesn't seem that hard!

Reply: 666, bro, a whole run of moves fierce as a tiger.

冷诗烟 (2020-5-8 22:08): Got any pics? Want to see.
Author (被遗忘的路人, 2020-5-8 22:12): Here, have a look:
https://attach.52pojie.cn//forum/202005/08/221234m0tto89otzot8vsq.png?l
https://attach.52pojie.cn//forum/202005/08/221232pm3uwulsmyujwn1s.png?l

Reply: Sharing pics, haha.
Reply: Studying this carefully; hope I can put it to use someday. Thanks.
Reply: Thanks for sharing.
Reply: I just envy people with this kind of hands-on ability.