吾爱破解 - 52pojie.cn

 找回密码
 注册[Register]

QQ登录

只需一步,快速开始

查看: 4697|回复: 18
收起左侧

[Python 转载] python 爬虫 爬取vmgirls美图

  [复制链接]
咕嚕靈啵 发表于 2022-5-12 19:45
本帖最后由 咕嚕靈啵 于 2022-5-12 19:48 编辑

初次接触python和爬虫,就写了一个,效率比较低(而且不知道为什么会爬取到相同内容的东西,有些内容下载不成功)
求大神指点改进
求大神指点改进
求大神指点改进
仅供学习交流与娱乐哈~
[Python] 纯文本查看 复制代码
from nturl2path import pathname2url
import requests
import os
import time
import re

def send_requests(url_req):
    """Fetch ``url_req`` and return the body decoded as UTF-8 text.

    Sends a browser-like User-Agent so the site serves normal pages.
    A timeout is set so a stalled connection cannot hang the crawler
    forever (the original had none), and the response is used as a
    context manager so the connection is always released.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
    }
    with requests.get(url=url_req, headers=headers, timeout=10) as response:
        response.encoding = 'utf-8'
        return response.text
    

def find_url(url_resp):
    """Extract every (gallery_url, gallery_title) pair from a list page.

    ``url_resp`` is the raw HTML of one category list page; the pattern
    matches the gallery anchor tags (note the leading space, which the
    site's markup includes).
    """
    pattern = re.compile(
        r' <a class="media-content" target="_blank" href="(?P<url>.*?)" title="(?P<title>.*?)"',
        re.S,
    )
    return pattern.findall(url_resp)

def find_img(url_resp_son):
    """Extract every (image_url, alt_text) pair from a gallery detail page."""
    pattern = re.compile(
        r'<a rel="nofollow" href="(?P<img>.*?)" alt="(?P<title>.*?)"',
        re.S,
    )
    return pattern.findall(url_resp_son)

def dr_img(url_img, path, title):
    """Download one image to ``path``\\``title`` unless it already exists.

    Parameters:
        url_img: direct URL of the image file.
        path:    destination directory (must already exist).
        title:   file name to save the image under.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
    }
    pathname = os.path.join(path, title)  # build the target path exactly once
    if os.path.exists(pathname):
        print(f"{title} has exist!")
        return
    # Context manager releases the connection even if writing fails; the
    # original had a bare no-op `img_resp.content` statement plus a manual
    # close, and rebuilt the path string a second time for open().
    with requests.get(url=url_img, headers=headers, timeout=10) as img_resp:
        try:
            with open(pathname, mode='wb') as f:
                f.write(img_resp.content)
                print(title + ' over!')
        except TypeError:
            print("TypeError")


def creat_main_file(filename, base_path=r"D:\vs\爬虫\美图爬虫\vmgirls"):
    """Create the top-level download folder ``base_path``\\``filename``.

    ``base_path`` defaults to the original hard-coded location, so existing
    callers are unaffected; pass a different directory to relocate output.
    Prints a message instead of raising when the folder already exists.
    """
    try:
        os.makedirs(os.path.join(base_path, filename))
        print("已创建文件夹")
    except FileExistsError:
        print("已经有该文件夹")
def creat_file(file_name_son, base_path=r"D:\vs\爬虫\美图爬虫\vmgirls\pure"):
    """Create one per-page folder ``base_path``\\``file_name_son``.

    ``base_path`` defaults to the original hard-coded location, keeping
    existing callers working unchanged. An already-existing folder is
    reported, not treated as an error.
    """
    try:
        os.makedirs(os.path.join(base_path, file_name_son))
    except FileExistsError:
        print("已有该文件夹")

def create_son_file(file_son_son_name, i, base_path=r"D:\vs\爬虫\美图爬虫\vmgirls\pure"):
    """Create a per-gallery folder under page ``i``'s directory.

    Layout: ``base_path``\\第{i}页\\``file_son_son_name``. ``base_path``
    defaults to the original hard-coded location so existing callers
    keep working. An already-existing folder is reported, not fatal.
    """
    try:
        page_dir = os.path.join(base_path, f"第{i}页")
        os.makedirs(os.path.join(page_dir, file_son_son_name))
        print(f"{file_son_son_name}已创建")
    except FileExistsError:
        print("已有该文件夹")

# ---------------------------------------------------------------------------
# Main crawl: walk list pages 1..97 of the "pure" category, open every
# gallery on each page, and download every image found in each gallery.
# ---------------------------------------------------------------------------
creat_main_file("pure")
for i in range(1, 98):
    creat_file(f"第{i}页")
    url = f"https://www.vmgirls.com/pure/page/{i}/"
    resp = send_requests(url)           # raw list-page HTML
    url_sonss = find_url(resp)          # [(gallery_url, gallery_title), ...]
    print(url_sonss)
    # Tuple unpacking instead of indexing; the original also printed each
    # pair a second time as a redundant debug line.
    for url_son_t, url_title in url_sonss:
        print(url_son_t)
        print(url_title)

        resp_son = send_requests(url_son_t)   # gallery detail HTML
        result_imgs = find_img(resp_son)      # [(img_url, alt_text), ...]

        create_son_file(url_title, i)

        for img_url, _alt in result_imgs:     # each image link in the gallery
            print(img_url)
            last_name = img_url.split("/")[-1]    # URL tail becomes file name
            dr_img(img_url, rf"D:\vs\爬虫\美图爬虫\vmgirls\pure\第{i}页\{url_title}", last_name)
        time.sleep(2)   # be polite: pause between galleries
    # The original had two consecutive time.sleep(2) calls here; merged
    # into one equivalent 4-second pause between list pages.
    time.sleep(4)


图片1

图片1

图片2

图片2

免费评分

参与人数 5吾爱币 +4 热心值 +4 收起 理由
zhangzsf + 1 + 1 谢谢@Thanks!
lian52yy + 1 + 1 用心讨论,共获提升!
beyond1994 + 1 用心讨论,共获提升!
8382107 + 1 + 1 热心回复!
Pengruirui + 1 我很赞同!

查看全部评分

发帖前要善用论坛搜索功能,那里可能会有你要找的答案或者已经有人发布过相同内容了,请勿重复发帖。

话痨司机啊 发表于 2022-5-13 01:20
paxj168 发表于 2022-5-14 14:02
[Python] 纯文本查看 复制代码
import requests
import os
import time
import re


class meinuimages:
    """vmgirls.com "fresh"-category image scraper.

    Pipeline: get_url_list() collects gallery links from the list pages,
    get_url_img() scrapes every image URL inside each gallery, and
    save_images() downloads them all to disk.
    """

    # Patterns compiled once instead of per page (note the leading space in
    # the list pattern — the site's markup includes it).
    _LIST_RE = re.compile(
        r' <a class="media-content" target="_blank" href="(?P<url>.*?)" title="(?P<title>.*?)"',
        re.S)
    _IMG_RE = re.compile(
        r'<a rel="nofollow" href="(?P<img>.*?)" alt="(?P<title>.*?)"',
        re.S)

    def __init__(self):
        # Browser-like User-Agent so the site serves normal pages.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
        }

    def get_url_list(self):
        """Return one {'page', 'url', 'list_title'} dict per list page.

        'list_title' is the list of (gallery_url, gallery_title) pairs
        found on that page.
        """
        url_titless = []
        for i in range(1, 3):
            urls = "https://www.vmgirls.com/fresh/page/{}/".format(i)
            try:
                # BUG FIX: the original passed self.headers positionally,
                # which requests.get() treats as `params` (query string),
                # so the User-Agent header was never actually sent.
                res = requests.get(urls, headers=self.headers, timeout=10).text
                time.sleep(2)  # throttle between list-page requests
                url_title = self._LIST_RE.findall(res)
                url_titless.append({
                    'page': i,
                    'url': urls,
                    'list_title': url_title,
                })
            except requests.RequestException:
                # Narrowed from a bare `except:` that swallowed everything.
                print('请示错误当前第%s页=url:%s' % (i, urls))

        return url_titless

    def get_url_img(self):
        """Return, per list page, a list of {'title', 'img'} gallery dicts."""
        datas = []
        for page in self.get_url_list():
            img_list = []
            for gallery_url, title in page['list_title']:
                html = requests.get(url=gallery_url, headers=self.headers,
                                    timeout=10).text
                result_img = self._IMG_RE.findall(html)
                img_list.append({'title': title, 'img': result_img})
            datas.append(img_list)
        return datas

    def save_images(self):
        """Download every scraped image into D:\\美图爬虫\\<gallery title>\\."""
        path = r"D:\美图爬虫"  # plain local; the original used a `global path` hack
        # BUG FIX: the original iterated self.get_url_img()[0] and therefore
        # saved only the first list page even though two were scraped.
        for page in self.get_url_img():
            for gallery in page:
                folder = os.path.join(path, gallery['title'])
                try:
                    os.makedirs(folder)
                    print("已创建文件夹")
                except FileExistsError:
                    print("已经有该文件夹")

                for images in gallery['img']:
                    img_resp = requests.get(url=images[0], headers=self.headers,
                                            timeout=10)
                    img_name = images[0].split('/')[-1]
                    try:
                        with open(os.path.join(folder, img_name), mode='wb') as f:
                            f.write(img_resp.content)
                            print(gallery['title'], img_name + ' 保存成功')
                    except TypeError:
                        print("保存失败:{}".format(gallery['title']))


if __name__ == '__main__':
    # Entry point: build the scraper and run the full download pipeline.
    scraper = meinuimages()
    scraper.save_images()
bj9ye666 发表于 2022-5-13 00:35
tbloy 发表于 2022-5-13 01:44
过来学习支持一下。
tfrist 发表于 2022-5-13 01:56
爬虫 不错!
头像被屏蔽
shayu2021 发表于 2022-5-13 06:19
提示: 作者被禁止或删除 内容自动屏蔽
zhjm21 发表于 2022-5-13 07:51
我是来学习的!
XiaoZouYu 发表于 2022-5-13 08:47
来学习哈,大佬啊
he58394835 发表于 2022-5-13 09:04
慢慢继续探索吧
shou0823 发表于 2022-5-13 09:14
谢谢分享
您需要登录后才可以回帖 登录 | 注册[Register]

本版积分规则

返回列表

RSS订阅|小黑屋|处罚记录|联系我们|吾爱破解 - LCG - LSG ( 京ICP备16042023号 | 京公网安备 11010502030087号 )

GMT+8, 2024-11-25 04:40

Powered by Discuz!

Copyright © 2001-2020, Tencent Cloud.

快速回复 返回顶部 返回列表