
[Python Repost] A newbie scrapes girl pics, big shots steer clear

被遗忘的路人 posted on 2020-5-8 22:01
Last edited by 被遗忘的路人 on 2020-5-9 13:09

First off, I don't even qualify as a newbie, because I'd never written Python before; I do front-end work.


Second, I found the syntax is a lot like JS.


On top of that, I borrowed from this guy's thread and fixed a few problems: https://www.52pojie.cn/thread-1168386-1-1.html

Why the changes? I left my computer scraping overnight and ran over all excited the next morning to check. It had downloaded a dozen or so folders, then hit a special character in a name, threw an error, and stopped. So I tweaked it a bit, hehe!

Version 1 (2020-05-08):

1. Waits three seconds per image so the server doesn't ban your IP; my work computer got banned a few times because I hadn't set a delay;
2. You choose how many pages to scrape; it's configurable, which is more convenient;
3. Special characters are stripped automatically, since Windows won't accept certain characters in folder names (a small sketch of this step follows this list).
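
To make point 3 concrete, here is a minimal sketch of the stripping step; the regEx pattern matches the characters Windows rejects in file and folder names, and the gallery title used here is made up for illustration:

[Python]
import re

# Characters Windows forbids in file/folder names: \ / : * ? " < > |
regEx = r'[\\/:*?"<>|]'

# A hypothetical gallery title containing a forbidden ':' and '?'
print(re.sub(regEx, '', '清纯妹子: 第01期?'))  # prints: 清纯妹子 第01期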

[Python]
import requests
from lxml import etree
import os
import time
import re

# Folder name for storing images (disabled; each gallery gets its own folder below)
# folder = "图片"
# if not os.path.exists(folder):
#     os.mkdir(folder)

# Start page
play_page = 1

# End page (0 = detect the last page automatically)
and_page = 0

# Pause after each image, in seconds
time_sleep = 3

# Site to scrape
post_url = "https://www.mzitu.com/"

# Characters Windows forbids in file/folder names
regEx = r'[\\/:*?"<>|]'

# Request headers
header = {
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36",
    "referer": "https://www.mzitu.com/",
    "Connection": "close"
}

# Fetch the front page and get its text
post_url_text = requests.get(post_url, headers=header).text

# Parse the HTML so we can run xpath queries on it
html = etree.HTML(post_url_text)

if and_page <= 0:
    # The second-to-last nav link is the total page count
    page_number = int(html.xpath('//*[@class="nav-links"]/a/text()')[-2])
else:
    page_number = and_page

while play_page <= page_number:
    # URL of the next list page to scrape
    new_post_url = 'https://www.mzitu.com/page' + '/' + str(play_page)

    # Fetch the list page
    new_post_url_text = requests.get(new_post_url, headers=header).text

    # Parse it for xpath
    new_html = etree.HTML(new_post_url_text)

    # Collect the gallery URLs on this list page
    new_list_page_url = new_html.xpath('//*[@id="pins"]/li/a/@href')

    for image_url in new_list_page_url:
        # Fetch the gallery page
        image_url_text = requests.get(image_url, headers=header).text

        # Parse it for xpath
        image_html = etree.HTML(image_url_text)

        # Gallery page numbers (the second-to-last entry is the total)
        max_image_page = image_html.xpath('//*[@class="pagenavi"]/a/span/text()')

        # Gallery title, with forbidden characters stripped
        image_name = re.sub(regEx, '', (''.join(image_html.xpath('//*[@class="main-image"]/p/a/img/@alt'))))

        # Create a folder named after the gallery
        if not os.path.exists(image_name):
            os.mkdir(image_name)

        image_i = 1  # index of the first image
        while image_i <= int(max_image_page[-2]):
            # Pause after each image to give the site a break
            time.sleep(time_sleep)

            # Page that holds the actual image
            real_image_url = image_url + '/' + str(image_i)

            # Fetch it
            real_image_url_r = requests.get(real_image_url, headers=header).text

            # Parse it for xpath
            real_image_url_html = etree.HTML(real_image_url_r)

            '''Image download starts here'''
            # Image title (alt text, used as the file name)
            image_alt = real_image_url_html.xpath('//*[@class="main-title"]/text()')

            # The image's src URL
            image_src = real_image_url_html.xpath('//*[@class="main-image"]/p/a/img/@src')

            # Loop over the titles...
            for pic_name in image_alt:
                # ...and the image URLs, saving each one
                for pic_url in image_src:
                    with open(image_name + '/' + (re.sub(regEx, '', pic_name)) + '.jpg', 'wb') as pic:
                        pic.write(requests.get(pic_url, headers=header).content)
                        # Print the image name as progress
                        print(pic_name)

            image_i = image_i + 1

    # Move on to the next list page
    play_page = play_page + 1
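
One note on the script above: any single dropped connection raises an exception and kills the whole run, which is exactly what the timeout report further down this thread shows. A minimal sketch of one way to soften that, mounting urllib3 retries on a requests.Session; this is an assumption on my part, not something the original script does:

[Python]
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Retry transient failures a few times with exponential backoff
session = requests.Session()
retries = Retry(total=3, backoff_factor=2, status_forcelist=[429, 500, 502, 503])
session.mount('https://', HTTPAdapter(max_retries=retries))

# e.g. session.get(post_url, headers=header, timeout=10) wherever the script calls requests.get(...)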




Version 2 (2020-05-09):
1. Choose the scrape mode;
2. Added search;
3. Custom start page;
4. Custom end page.

[Python]
import requests
from lxml import etree
import os
import time
import re

# Start and end page (post_1 asks the user for the real values)
play_page = 0

and_page = 0

# Pause after each image, in seconds
time_sleep = 3

# Characters Windows forbids in file/folder names
regEx = r'[\\/:*?"<>|]'


def set_input(url, default_url, default_type):
    # Ask which site to scrape, then dispatch to post_1 / post_2 by index
    user_url = int(float(input("Which site do you want to scrape? (" + url + "): ")))

    if (user_url > 0) and (user_url <= len(default_url)):

        print("You chose: \033[1;35m" + default_url[user_url - 1] + "\033[0m")

        eval('post_' + str(user_url))(default_url[user_url - 1], default_type)

    else:

        print("Invalid input, please try again!")

        set_input(url, default_url, default_type)

def post_1(default_url, default_type):
    # Menu string listing the scrape modes, e.g. "1.sequential scrape;2.search scrape;"
    type_menu = ""

    header = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36",
        "referer": default_url,
        "Connection": "close"
    }

    default_type_number = 1

    for index in range(len(default_type)):
        type_menu = type_menu + str(index + 1) + "." + default_type[index] + ";"

    grab_type = int(float(input("Which mode do you want? (" + type_menu + "): ")))

    if 1 <= grab_type <= len(default_type):

        print("Selected mode: \033[1;35m" + default_type[grab_type - 1] + "\033[0m")

        default_type_number = grab_type

    else:

        print("Invalid input, defaulting to sequential mode!")

    if default_type_number == 1:

        default_url_html = etree.HTML(requests.get(default_url, headers=header).text)

        # The second-to-last nav link is the total page count
        page_number = int(default_url_html.xpath('//*[@class="nav-links"]/a/text()')[-2])

        user_play_page = int(float(input("Start page: ")))

        if user_play_page <= page_number and user_play_page > 0:

            play_page = user_play_page

        else:

            print("Invalid page number, starting from page 1!")

            play_page = 1

        user_and_page = int(float(input("End page: ")))

        if (play_page <= user_and_page) and (user_and_page <= page_number):

            and_page = user_and_page

        else:

            print("Invalid page number, stopping at the last page!")

            and_page = page_number

        print("Starting the scrape, please wait...")

        while play_page <= and_page:

            post_url = default_url + 'page/' + str(play_page)

            post_url_html = etree.HTML(requests.get(post_url, headers=header).text)

            post_url_html_page = post_url_html.xpath('//*[@id="pins"]/li/a/@href')

            post_1_down(post_url_html_page, header)

            play_page = play_page + 1

    else:

        user_search = input("What do you want to search for? ")

        print("Starting the scrape, please wait...")

        search_post_url = default_url + 'search/' + user_search + '/'

        search_post_url_html = etree.HTML(requests.get(search_post_url, headers=header).text)

        # The second-to-last pager link is the number of result pages
        # (raises IndexError when the results fit on a single page)
        search_post_url_html_page = int(search_post_url_html.xpath('//*[@class="pagination"]/div/a/text()')[-2])

        if search_post_url_html_page >= 1:

            search_image_page = 1

            while search_image_page <= search_post_url_html_page:
                search_image_url = search_post_url + 'page/' + str(search_image_page)

                search_image_url_html = etree.HTML(requests.get(search_image_url, headers=header).text)

                search_image_url_html_page = search_image_url_html.xpath('//*[@id="pins"]/li/a/@href')

                post_1_down(search_image_url_html_page, header)

                search_image_page = search_image_page + 1
        else:

            print("No search results!")

def post_1_down(url_page, header):

    for image_url in url_page:

        image_html = etree.HTML(requests.get(image_url, headers=header).text)

        # Gallery page numbers (the second-to-last entry is the total)
        max_image_page = image_html.xpath('//*[@class="pagenavi"]/a/span/text()')

        # Gallery title, with forbidden characters stripped
        image_name = re.sub(regEx, '', (''.join(image_html.xpath('//*[@class="main-image"]/p/a/img/@alt'))))

        if not os.path.exists(image_name):
            os.mkdir(image_name)

        image_i = 1

        while image_i <= int(max_image_page[-2]):

            time.sleep(time_sleep)

            real_image_url = image_url + '/' + str(image_i)

            real_image_url_r = requests.get(real_image_url, headers=header).text

            real_image_url_html = etree.HTML(real_image_url_r)

            image_alt = real_image_url_html.xpath('//*[@class="main-title"]/text()')

            image_src = real_image_url_html.xpath('//*[@class="main-image"]/p/a/img/@src')

            for pic_name in image_alt:

                for pic_url in image_src:
                    with open(image_name + '/' + (re.sub(regEx, '', pic_name)) + '.jpg', 'wb') as pic:
                        pic.write(requests.get(pic_url, headers=header).content)

                        print("Downloading: " + pic_name)

            image_i = image_i + 1

def post_2(default_url, default_type):
    print("This is the second site: " + default_url, default_type)



url = ""

default_url = ['https://www.mzitu.com/', 'in development']

default_type = ['sequential scrape', 'search scrape']

# Build the "1.<url>;2.<url>;" menu string
for index in range(len(default_url)):
    url = url + str(index + 1) + "." + default_url[index] + ";"

set_input(url, default_url, default_type)
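
A side note on the eval('post_' + str(user_url)) dispatch in set_input: it works, but a plain dict lookup does the same job without eval. A sketch of the alternative, reusing the same names; this is my suggestion, not part of the original:

[Python]
# Map menu index -> handler function; no eval needed
handlers = {1: post_1, 2: post_2}

choice = handlers.get(user_url)
if choice is not None:
    choice(default_url[user_url - 1], default_type)
else:
    print("Invalid input, please try again!")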





I have nothing to do at work all day right now, so I'm studying Python! If I get time later I'll add search and other features!



xiaohanGG posted on 2020-6-22 13:50
Bro, it only scrapes the start page, then it errors out:
Traceback (most recent call last):
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
    conn = connection.create_connection(
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\util\connection.py", line 84, in create_connection
    raise err
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\util\connection.py", line 74, in create_connection
    sock.connect(sa)
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connectionpool.py", line 976, in _validate_conn
    conn.connect()
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connection.py", line 308, in connect
    conn = self._new_conn()
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connection.py", line 171, in _new_conn
    raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPSConnection object at 0x000001B445964B20>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\requests\adapters.py", line 439, in send
    resp = conn.urlopen(
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\connectionpool.py", line 724, in urlopen
    retries = retries.increment(
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\site-packages\urllib3\util\retry.py", line 439, in increment
    raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='www.mzitu.com', port=443): Max retries exceeded with url: /228185/5 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001B445964B20>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.'))
含笑步步颠 posted on 2020-5-9 15:29
Starting the scrape, please wait...
Traceback (most recent call last):
  File "C:\Users\Administrator\Desktop\py.py", line 185, in <module>
    set_input(url, default_url, default_type)
  File "C:\Users\Administrator\Desktop\py.py", line 24, in set_input
    eval('post_' + str(user_url))(default_url[user_url - 1], default_type)
  File "C:\Users\Administrator\Desktop\py.py", line 112, in post_1
    search_post_url_html_page = int(search_post_url_html.xpath('//*[@class="pagination"]/div/a/text()')[-2])
IndexError: list index out of range



I get this when I try a search download. What's the problem??
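
The IndexError above comes from the [-2] index on the pagination xpath: when the search results fit on a single page, the site renders no pager, so the list has fewer than two entries. A sketch of a guard that falls back to one page, reusing the v2 script's variable names (my suggestion, not the author's fix):

[Python]
# Guard against a missing pager (single page of search results)
page_links = search_post_url_html.xpath('//*[@class="pagination"]/div/a/text()')
search_post_url_html_page = int(page_links[-2]) if len(page_links) >= 2 else 1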
hao290558809 posted on 2020-5-8 22:03
Impressive, bro, that was one smooth run.

OP | 被遗忘的路人 posted on 2020-5-8 22:05
Honestly, I'm a beginner too! I just started looking at Python, and it doesn't seem that hard!
冷诗烟 posted on 2020-5-8 22:08
Got any pictures? I'd like to see.
余佳卓 posted on 2020-5-8 22:10
666, bro, a whole flurry of moves, fierce as a tiger.
OP | 被遗忘的路人 posted on 2020-5-8 22:12
Here, take a look:
[screenshots attached: 微信截图_20200508220916.png, 微信截图_20200508220946.png]
2321490 posted on 2020-5-8 22:16
Sharing the pictures, haha
ladiosfei posted on 2020-5-8 22:17
Studying this carefully, hoping it comes in handy someday. Thanks.
ckypamym119 posted on 2020-5-8 22:19
Thanks for sharing
badou0332 posted on 2020-5-8 22:47
I just envy people who can actually build things like this.