Python批量爬取wallhaven高清壁纸并储存

无际发表于 2023-6-27 14:21

本帖最后由苏紫方璇于 2023-7-3 00:50 编辑

程序已经打包好，请自行领取：https://wwnt.lanzout.com/b04q3351i 密码:2u43
简介：
1、选好描述图片的关键词，进入下面的页面即为我们爬取的目标页面。可见图片是以页（page）分组的，每组都是24张图片。
https://s1.ax1x.com/2023/06/27/pCaYehj.png
选择这个页面主要是因为好爬，因为它的网址是下面这种形式：https://wallhaven.cc/search?q=id%3A1&............&seed=pisTch&page=5,
只要改变 page 这一个参数就可以轻松定位到我们想要的图片位置。
需要说明的是，爬取的对象并不是当前页面显示的这些图片，这些只是封面，清晰度非常低，我们要下载的是点进去后的高清图，具体操作参见程序。
最后提醒：因为是外国网站，而且图片的分辨率较高，一张图片大小往往几m十几m，所以有时候下载的很缓慢，甚至完成不了所有的下载任务。所以能不能爬成功随缘，一般是可以爬到的，就是爬不完整。
2、爬取到图片后，会自动在桌面上建立一个文件夹保存，分类方式是日期:

https://s1.ax1x.com/2023/06/27/pCatbdO.png
图片命名是

https://s1.ax1x.com/2023/06/27/pCatjWd.png
   需要注意，基于以上所述的储存方式，如果在同一天想要爬取多个不同标签描述的图片（即更改url），需要在更改url之前，先把当天已经爬取的图片保存到其他位置，否则会被新爬取的图片覆盖。当然有能力、有兴趣的伙伴还可以根据需求对程序进行相应的功能完善。
3、以下是面向对象的程序，可以根据需求自行更改，运行后输入要爬取的开始页和结束页就可以开始运行（开始和结束数字可相同）：
https://s1.ax1x.com/2023/06/27/pCaamZV.png
from lxml import etree
from time import sleep
from datetime import date
import requests
import os

class Picture():
    """Crawl wallhaven search-result pages and save the full-size images.

    Each search-result page only lists thumbnail links; for every thumbnail
    the crawler opens the image's detail page, reads the ``src`` of the
    ``#wallpaper`` element and downloads that file.
    """

    # Attributes
    def __init__(self) -> None:
        # Search URL without the page number; gets() appends the page index.
        # Alternative query kept from the original post:
        # self.url='https://wallhaven.cc/search?q=id%3A1&categories=110&purity=100&atleast=3440x1440&sorting=random&order=desc&seed=pisTch&page='
        self.url = 'https://wallhaven.cc/search?categories=011&purity=110&sorting=date_added&order=desc&ai_art_filter=0&page='  # change as needed
        self.headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57'}

    # Fetch one search-result page
    def gets(self, url_page):
        """Return the raw response body (bytes) of search-result page ``url_page``."""
        url = self.url + str(url_page)  # build the URL of this page
        response = requests.get(url=url, headers=self.headers)
        return response.content

    # Parse a result page and store its images
    def download_data(self, data, page, date):
        """Download every image linked from one search-result page.

        Args:
            data: Raw HTML bytes of the search-result page (as returned by gets()).
            page: Page number; used as the first part of each file name.
            date: Value naming the per-day sub-folder (main() passes
                ``date.today()``). Note that this parameter shadows the
                ``datetime.date`` import inside this method.

        Files are written to ``美图库/<date>/<page>_<n>.png`` relative to the
        current working directory, where ``n`` is the 1-based position of the
        thumbnail on the page.
        """
        # The original post called ``.decode().replace(...)`` here, but the
        # replace arguments were garbled into an unterminated string literal.
        # lxml parses the raw bytes directly, so no text pre-processing is done.
        # NOTE(review): restore the replace if it stripped something required.
        html = etree.HTML(data)
        if html is None:
            return  # nothing parseable on this page

        # Links to the detail page of every thumbnail on this result page.
        detail_links = html.xpath('//*[@id="thumbs"]/section/ul/li/figure/a/@href')

        folder = os.path.join('美图库', str(date))
        os.makedirs(folder, exist_ok=True)

        for picture_num, link in enumerate(detail_links, start=1):
            # Open the image's detail page (same headers as the listing request).
            detail_bytes = requests.get(link, headers=self.headers).content
            detail_html = etree.HTML(detail_bytes)
            if detail_html is None:
                continue

            # xpath() returns a list; the real image address is its first item.
            sources = detail_html.xpath('//*[@id="wallpaper"]/@src')
            if not sources:
                continue  # detail page has no #wallpaper element; skip it
            url_image = sources[0]

            sleep(0.3)  # short delay so the server is not hit too frequently
            image = requests.get(url=url_image, headers=self.headers)

            # Name files "<page>_<n>.png", e.g. 1_2.png.
            file_path = os.path.join(folder, f'{page}_{picture_num}.png')
            with open(file_path, 'wb') as f:
                f.write(image.content)

    # Main program
    def main(self):
        """Ask for a page range, then crawl it into ``Desktop/美图库/<today>``."""
        first_page = int(input('输入起始页：'))  # first page to fetch
        last_page = int(input('输入结束页：'))   # last page to fetch (inclusive)
        print('玩命爬取中。。。。。。。')
        today = date.today()  # images are grouped by the day they were fetched

        # Save under the current user's Desktop.
        path_save = os.path.join(os.path.expanduser('~'), 'Desktop')
        os.chdir(path_save)
        os.makedirs(os.path.join('美图库', str(today)), exist_ok=True)

        try:
            for page in range(first_page, last_page + 1):
                data = self.gets(page)
                self.download_data(data, page, today)
        except Exception as exc:
            # Catch Exception rather than everything so Ctrl+C still stops the run.
            print('出现了一些小问题，请检查网络连接和输入内容')
            print(f'{type(exc).__name__}: {exc}')
        else:
            print('任务已完成')

# Create the crawler and run it only when this file is executed as a script,
# so that importing the module does not start a crawl.
if __name__ == "__main__":
    p1 = Picture()
    p1.main()

最后，新手上路，如有不足，请多指教！{:301_999:}

fengoto 发表于 2023-7-1 15:20

from lxml import etree

from time import sleep

from datetime import date

import requests

import os

class Picture():
    """Crawl wallhaven search-result pages and save the full-size images.

    Each search-result page only lists thumbnail links; for every thumbnail
    the crawler opens the image's detail page, reads the ``src`` of the
    ``#wallpaper`` element and downloads that file.
    """

    # Attributes
    def __init__(self) -> None:
        # Search URL without the page number; gets() appends the page index.
        # Alternative query kept from the original post:
        # self.url='https://wallhaven.cc/search?q=id%3A1&categories=110&purity=100&atleast=3440x1440&sorting=random&order=desc&seed=pisTch&page='
        self.url = 'https://wallhaven.cc/search?categories=011&purity=110&sorting=date_added&order=desc&ai_art_filter=0&page='  # change as needed
        self.headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57'}

    # Fetch one search-result page
    def gets(self, url_page):
        """Return the raw response body (bytes) of search-result page ``url_page``."""
        url = self.url + str(url_page)  # build the URL of this page
        response = requests.get(url=url, headers=self.headers)
        return response.content

    # Parse a result page and store its images
    def download_data(self, data, page, date):
        """Download every image linked from one search-result page.

        Args:
            data: Raw HTML bytes of the search-result page (as returned by gets()).
            page: Page number; used as the first part of each file name.
            date: Value naming the per-day sub-folder (main() passes
                ``date.today()``). Note that this parameter shadows the
                ``datetime.date`` import inside this method.

        Files are written to ``美图库/<date>/<page>_<n>.png`` relative to the
        current working directory, where ``n`` is the 1-based position of the
        thumbnail on the page.
        """
        # The original post called ``.decode().replace(...)`` here, but the
        # replace arguments were garbled into an unterminated string literal.
        # lxml parses the raw bytes directly, so no text pre-processing is done.
        # NOTE(review): restore the replace if it stripped something required.
        html = etree.HTML(data)
        if html is None:
            return  # nothing parseable on this page

        # Links to the detail page of every thumbnail on this result page.
        detail_links = html.xpath('//*[@id="thumbs"]/section/ul/li/figure/a/@href')

        folder = os.path.join('美图库', str(date))
        os.makedirs(folder, exist_ok=True)

        for picture_num, link in enumerate(detail_links, start=1):
            # Open the image's detail page (same headers as the listing request).
            detail_bytes = requests.get(link, headers=self.headers).content
            detail_html = etree.HTML(detail_bytes)
            if detail_html is None:
                continue

            # xpath() returns a list; the real image address is its first item.
            sources = detail_html.xpath('//*[@id="wallpaper"]/@src')
            if not sources:
                continue  # detail page has no #wallpaper element; skip it
            url_image = sources[0]

            sleep(0.3)  # short delay so the server is not hit too frequently
            image = requests.get(url=url_image, headers=self.headers)

            # Name files "<page>_<n>.png", e.g. 1_2.png.
            file_path = os.path.join(folder, f'{page}_{picture_num}.png')
            with open(file_path, 'wb') as f:
                f.write(image.content)

    # Main program
    def main(self):
        """Ask for a page range, then crawl it into ``Desktop/美图库/<today>``."""
        first_page = int(input('输入起始页：'))  # first page to fetch
        last_page = int(input('输入结束页：'))   # last page to fetch (inclusive)
        print('玩命爬取中。。。。。。。')
        today = date.today()  # images are grouped by the day they were fetched

        # Save under the current user's Desktop.
        path_save = os.path.join(os.path.expanduser('~'), 'Desktop')
        os.chdir(path_save)
        os.makedirs(os.path.join('美图库', str(today)), exist_ok=True)

        try:
            for page in range(first_page, last_page + 1):
                data = self.gets(page)
                self.download_data(data, page, today)
        except Exception as exc:
            # Catch Exception rather than everything so Ctrl+C still stops the run.
            print('出现了一些小问题，请检查网络连接和输入内容')
            print(f'{type(exc).__name__}: {exc}')
        else:
            print('任务已完成')



# Create the crawler and run it only when this file is executed as a script,
# so that importing the module does not start a crawl.
if __name__ == "__main__":
    p1 = Picture()
    p1.main()

lgb446023743 发表于 2023-6-27 14:34

楼主，代码呢。

无际发表于 2023-6-27 14:43

lgb446023743 发表于 2023-6-27 14:34
楼主，代码呢。

第一次发帖有点问题，待会会重新发

xhw1979 发表于 2023-6-27 14:58

{:1_918:}期待楼主更新代码{:1_918:}:victory:

long8586 发表于 2023-6-27 14:58

无际发表于 2023-6-27 14:43
第一次发帖有点问题，待会会重新发

来个打好包的啊{:1_899:}

any_0531 发表于 2023-6-27 17:36

在线等，挺急的。
楼主加油！

无际发表于 2023-6-27 18:39

代码已经打包好了，大家可以领取了{:301_999:}

yyd841122 发表于 2023-6-28 06:38

牛啊，都打包好了，直接用啊！！

yu520 发表于 2023-6-28 08:27

感谢博主分享，这个太好了

无际发表于 2023-6-28 10:39

第一次发帖，不知道代码是怎么放的，排版有点丑，有没有人教教我{:301_999:}

页: [1] 2 3 4 5

吾爱破解 - 52pojie.cn's Archiver

Python批量爬取wallhaven高清壁纸并储存