萌新刚练手写的一个图片爬虫

Zeaf · 发表于 2020-3-8 14:54

第一次发这种贴，大佬勿喷，爬出来的图片质量也不高，有待继续改进爬虫
参考了某大佬的源码~（不过我也不记得是谁了{:1_925:} ）
网站原地址：https://isorepublic.com/
源码如下

[Python] 纯文本查看 复制代码

# -*- coding: utf-8 -*-
"""
Created on Sun Mar  8 13:09:19 2020

@author: Zeaf
"""

"""导入模块"""
import requests  # 导入requests库
import re  # 导入正则表达式库
import os  # 导入操作系统库
import time  # 导入时间库
# Crawl the isorepublic.com listing pages and save every .jpg URL under
# wp-content/uploads found in the page HTML into the local folder 'iso图片'.
# Each file is named after the title text matched on the same page.
print('若想停止请按ctrl+C')
os.makedirs('iso图片', exist_ok=True)  # create the output folder if it is missing
# Browser-like User-Agent header sent with every request.
user = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
for i in range(1, 100):
    # Page 1 is the site root; later pages live under /page/<n>/.
    if i == 1:
        page_url = 'https://isorepublic.com/'
    else:
        page_url = 'https://isorepublic.com/page/' + str(i) + '/'
    try:
        # A timeout keeps a stalled connection from blocking the crawl forever.
        response = requests.get(page_url, headers=user, timeout=30)
        response.raise_for_status()
        html = response.text
    except requests.RequestException:
        html = ''  # handled below exactly like a page without any match
    # Dots are escaped so that only a literal ".jpg" suffix is accepted.
    urls_wz = re.findall(r'https://isorepublic\.com/wp-content/uploads/.*?\.jpg', html)
    names = re.findall('title="(.*?)" class=', html)
    if not urls_wz:
        print('保存失败')
        if i == 1:
            break  # as in the original flow, give up when the first page yields nothing
        continue
    # zip() pairs URLs and titles positionally and stops at the shorter list.
    for url_wz, name in zip(urls_wz, names):
        time.sleep(5)  # throttle between downloads so the site does not block us
        try:
            response = requests.get(url_wz, headers=user, timeout=30)
            response.raise_for_status()  # never store an error page as a .jpg
        except requests.RequestException:
            print('保存失败')
            continue
        print("正在保存图片中……")
        # Replace characters that are not allowed in file names.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
        with open('iso图片/' + safe_name + '.jpg', 'wb') as f:
            f.write(response.content)
    print("保存第" + str(i) + "页图片完毕！")

Zeaf · 发表于 2020-3-9 22:28

wenwu.he 发表于 2020-3-8 18:41
回头改改把下原图，这个网站图片质量不错。可以当壁纸用。

交作业...但不知道是不是我网络原因，爬了几张就一直保存中...很久也没个结果

[Python] 纯文本查看 复制代码

"""
Created on Mon Mar  9 21:20:07 2020

@author: Zeaf
"""

import requests  # 导入requests库
import re  # 导入正则表达式库
import os  # 导入操作系统库
import time  # 导入时间库

def save():
    """Download the images listed in the module-level ``urls`` into 'iso图片/'.

    Reads the globals ``urls``, ``names`` and ``user``. URLs and names are
    paired positionally with ``zip``, so extra items in the longer list are
    ignored. A download that fails or times out is reported and skipped so
    the remaining images are still fetched.
    """
    for url, name in zip(urls, names):
        try:
            # A timeout keeps a stalled connection from hanging the crawl.
            response = requests.get(url, headers=user, timeout=30)
            response.raise_for_status()  # never store an error page as a .jpg
        except requests.RequestException:
            print('保存图片失败！')
            continue
        # Replace characters that are not allowed in file names.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
        # The with-block closes the file; the original bare `f.close` did nothing.
        with open('iso图片/' + safe_name + '.jpg', 'wb') as f:
            f.write(response.content)
        print('保存图片成功！')

# Crawl listing pages 1-99 of isorepublic.com. For every photo detail page
# linked from a listing page, extract the .jpg download link and its
# "Download ..." title, then let save() store the files in 'iso图片'.
print('若想停止请按ctrl+C')
os.makedirs('iso图片', exist_ok=True)  # create the output folder if it is missing
# Browser-like User-Agent header sent with every request.
user = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
for i in range(1, 100):
    # Page 1 is the site root; later pages live under /page/<n>/.
    if i == 1:
        page_url = 'https://isorepublic.com/'
    else:
        page_url = 'https://isorepublic.com/page/' + str(i) + '/'
    try:
        # A timeout keeps a stalled connection from blocking the crawl forever.
        response = requests.get(page_url, headers=user, timeout=30)
        response.raise_for_status()
        html = response.text
    except requests.RequestException:
        html = ''  # handled below exactly like a page without any match
    # Links to the individual photo pages found on this listing page.
    urls_in = re.findall(r'<a href="(https://isorepublic\.com/photo/.*?)" title=', html)
    if not urls_in:
        print('获取进入网址失败！')
        continue
    for url_in in urls_in:
        time.sleep(3)  # throttle between requests so the site does not block us
        try:
            response = requests.get(url_in, headers=user, timeout=30)
            response.raise_for_status()
            html = response.text
        except requests.RequestException:
            html = ''
        # Raw strings avoid the invalid "\." string escape of the original
        # patterns; every dot that must be literal is escaped.
        # `urls` and `names` are module-level on purpose: save() reads them.
        urls = re.findall(r'<a href="(https://isorepublic\.com/wp-content/uploads/.*?\.jpg)" title=', html)
        names = re.findall(r'\.jpg" title="Download (.*?)"', html)
        if urls:
            save()
        else:
            print('获取图片网址失败！')
    print('保存第' + str(i) + '页图片完毕！')

Zeaf · 发表于 2020-3-8 16:36

加了个函数，大概修改了一下注释~（其他的估计以我现在的水平改不了）

[Python] 纯文本查看 复制代码

# -*- coding: utf-8 -*-
"""
Created on Sun Mar  8 13:09:19 2020

@author: Zeaf
"""

"""导入模块"""
import requests  # 导入requests库
import re  # 导入正则表达式库
import os  # 导入操作系统库
import time  # 导入时间库

def save():
    """Download the images listed in the module-level ``urls`` into 'iso图片/'.

    Reads the globals ``urls``, ``names`` and ``user``. URLs and names are
    paired positionally with ``zip``, so extra items in the longer list are
    ignored. A download that fails or times out is reported and skipped so
    the remaining images are still fetched.
    """
    for url, name in zip(urls, names):
        time.sleep(5)  # throttle between downloads so the crawler is not detected
        try:
            # A timeout keeps a stalled connection from hanging the crawl.
            response = requests.get(url, headers=user, timeout=30)
            response.raise_for_status()  # never store an error page as a .jpg
        except requests.RequestException:
            print('保存失败')
            continue
        print("正在保存图片中……")
        # Replace characters that are not allowed in file names.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
        with open('iso图片/' + safe_name + '.jpg', 'wb') as f:
            f.write(response.content)
            
# Crawl listing pages 1-99 of isorepublic.com and hand the .jpg URLs and
# titles found on each page to save(), which writes them into 'iso图片'.
# (The original author's note says the site has more than 100 pages.)
print('若想停止请按ctrl+C')
os.makedirs('iso图片', exist_ok=True)  # create the output folder if it is missing
# Browser-like User-Agent header sent with every request.
user = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
for i in range(1, 100):
    # Page 1 is the site root; later pages live under /page/<n>/.
    if i == 1:
        page_url = 'https://isorepublic.com/'
    else:
        page_url = 'https://isorepublic.com/page/' + str(i) + '/'
    try:
        # A timeout keeps a stalled connection from blocking the crawl forever.
        response = requests.get(page_url, headers=user, timeout=30)
        response.raise_for_status()
        html = response.text
    except requests.RequestException:
        html = ''  # handled below exactly like a page without any match
    # `urls` and `names` are module-level on purpose: save() reads them.
    # Dots are escaped so that only a literal ".jpg" suffix is accepted.
    urls = re.findall(r'https://isorepublic\.com/wp-content/uploads/.*?\.jpg', html)
    names = re.findall('title="(.*?)" class=', html)
    if not urls:
        print('保存失败')
        if i == 1:
            break  # as in the original flow, give up when the first page yields nothing
        continue
    save()
    print("保存第" + str(i) + "页图片完毕！")

wanyan1997 · 发表于 2020-3-8 15:11

看起来好像很厉害

zyw2365 · 发表于 2020-3-8 15:11

用心讨论，共获提升！

人在江湖飘 · 发表于 2020-3-8 15:12

我也想玩，可惜不会，又没人出教程，哎，等有空慢慢学

cutPaper · 发表于 2020-3-8 15:16

有不少重复的代码，建议写一个函数直接来调用，可以省些事

vagrantear · 发表于 2020-3-8 15:24

过来围观学习一波

2014晴天 · 发表于 2020-3-8 15:54

本帖最后由 2014晴天于 2020-3-8 15:56 编辑

用XPath会不会更好一点呢？单页举例.


import os
import requests
from lxml import etree

# Single-page example: use XPath instead of regular expressions to pull the
# image URLs and captions out of the front page, then save each image into
# 'iso图片'. (Indentation restored; the posted snippet had lost it and did
# not parse.)
os.makedirs('iso图片', exist_ok=True)  # create the output folder if it is missing

url = "https://isorepublic.com/"
hd = {"user-agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}

# A timeout keeps a stalled connection from hanging the script.
r = requests.get(url, headers=hd, timeout=30).text
html = etree.HTML(r)

# `src` of each <img>, and the text of each <p>, inside the links under
# the element with id "photo-grid".
jpg_downs = html.xpath("//div[@id='photo-grid']/a/img/@src")
jpg_names = html.xpath("//div[@id='photo-grid']/a/p/text()")

if len(jpg_downs):
    # zip() pairs URLs and captions positionally and stops at the shorter list.
    for jpg_down, jpg_name in zip(jpg_downs, jpg_names):
        res = requests.get(jpg_down, headers=hd, timeout=30)
        with open('iso图片/' + jpg_name + '.jpg', 'wb') as f:
            f.write(res.content)

Zeaf · 发表于 2020-3-8 15:55

cutPaper 发表于 2020-3-8 15:16
有不少重复的代码，建议写一个函数直接来调用，可以省些事

有道理，多谢大佬指点

Zeaf · 发表于 2020-3-8 15:55

人在江湖飘发表于 2020-3-8 15:12
我也想玩，可惜不会，又没人出教程，哎，等有空慢慢学

论坛好像很多?之前看到很多人发什么学习笔记，资源区也有很多相关书籍

Zeaf · 发表于 2020-3-8 16:01

2014晴天发表于 2020-3-8 15:54
用XPath会不会更好一点呢？单页举例。

萌新，xpath没用过，目前只会正则表达式

以后再说吧，多谢指点

帐号		自动登录	找回密码
密码			注册[Register]

[Python 转载] 萌新刚练手写的一个图片爬虫

免费评分

免费评分

免费评分

用XPath会不会更好一点呢？单页举例.