Python 桌面壁纸爬取
- 鉴于本人对桌面壁纸的需求(同一张壁纸看久了容易视觉疲劳),又不想在网上一张一张地自己找,于是找了一个图片质量比较高的网站,把壁纸全部下载下来,方便使用。
本程序只供学习使用,不得用于任何商业用途
运行环境 :
python 3.8.7
win11
(只爬取了每种类型图片的第一页,20张)
# -*- coding: utf-8 -*-
# @Time : 2022/1/16 22:09
# @Author : yuluo
# @file : netbian.py
# Software : gvim
import time
import requests
import re
import os
from bs4 import BeautifulSoup
def main():
    """Run the whole crawl: home page, category pages, detail pages, files.

    The site root and the output directory are fixed inside this function;
    replace the placeholder assigned to ``save_dir`` with the directory the
    images should be written to.
    """
    site_root = 'https://pic.netbian.com/'
    save_dir = '要保存的目录路径'

    # Each stage feeds the next: the home page yields the category pages,
    # those yield the per-image detail pages, and those yield the image URLs.
    home_page = askUrl(site_root)
    category_pages = getDataUrlList(home_page, site_root)
    detail_pages = findEachImagesHtml(category_pages, site_root)
    image_links = findEachImagesUrl(detail_pages, site_root)
    saveData(image_links, save_dir)
def askUrl(url, timeout=10):
    """Send a GET request to *url* and return the response.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole crawl.

    Returns:
        The ``requests.Response`` object, with its text encoding set to GBK.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    # NOTE(review): the explicit None entries are presumably there to stop
    # requests from routing through a system-configured proxy -- confirm.
    proxies = {
        "http": None,
        "https": None,
    }
    # Placeholder values: fill in a real browser Cookie and User-Agent.
    header = {
        "Cookie": "cookie ",
        "User-Agent": "User-Agent",
    }
    response = requests.get(
        url=url,
        headers=header,
        proxies=proxies,
        timeout=timeout,
    )
    # Decode the body as GBK so that response.text is not garbled.
    response.encoding = 'GBK'
    return response
def getDataUrlList(response, url):
    """Extract the image-category page URLs from the home page.

    Args:
        response: Response for the home page; only ``response.text`` is read.
        url: Site root that each relative category path is appended to.

    Returns:
        A list of absolute category URLs without duplicates, in the order
        in which they first appear in the page.
    """
    # The regex is applied to BeautifulSoup's re-serialised markup rather
    # than to the raw response text.
    markup = str(BeautifulSoup(response.text, 'lxml'))
    # Capture the href path (minus its leading slash) of every link whose
    # title attribute starts with "4K".
    category_paths = re.findall(r'a href="/(.*?)" title="4K', markup)
    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # crawl order is the same on every run (a set would shuffle it).
    unique_paths = dict.fromkeys(category_paths)
    return [url + path for path in unique_paths]
def findEachImagesHtml(img_list, url):
    """Collect the detail-page URL of every image linked from the category pages.

    Args:
        img_list: Iterable of category page URLs to fetch.
        url: Site root used to build the absolute detail-page URLs.

    Returns:
        A list of ``<url>tupian/<id>.html`` URLs, category by category, in
        the order the links appear in each page.
    """
    # Compiled once instead of on every iteration. The dot before "html" is
    # escaped so that it matches only a literal "." and not any character.
    detail_link = re.compile(r'f="/tupian/(.*?)\.html" target')
    img_url = []
    for img_type_url in img_list:
        response = askUrl(img_type_url)
        markup = str(BeautifulSoup(response.text, 'lxml'))
        # extend() grows the list in place; rebuilding it with "+" for each
        # category would copy everything collected so far every time.
        img_url.extend(
            url + 'tupian/' + number + '.html'
            for number in detail_link.findall(markup)
        )
    return img_url
def findEachImagesUrl(tupian_html, url):
    """Resolve each image detail page to the URL(s) of the image it shows.

    Args:
        tupian_html: Iterable of detail-page URLs to fetch.
        url: Site root used to build the absolute image URLs.

    Returns:
        A list of ``<url>uploads/allimg/<name>.jpg`` URLs, in the order the
        detail pages were given.
    """
    # Compiled once instead of on every iteration. The dot before "jpg" is
    # escaped so that it matches only a literal "." and not any character.
    picture_src = re.compile(r'img src="/uploads/allimg/(.*?)\.jpg" data-pic')
    pic_url = []
    for page_url in tupian_html:
        # Here the pattern is matched against the raw response text.
        page_text = askUrl(page_url).text
        pic_url.extend(
            url + 'uploads/allimg/' + name + '.jpg'
            for name in picture_src.findall(page_text)
        )
    return pic_url
def saveData(tupian_url, path):
    """Download every image in *tupian_url* into the directory *path*.

    The directory is created first if necessary. Each file is named after
    the last path segment of its URL. Responses with an HTTP error status
    are reported and skipped instead of being written to disk, and only
    images actually saved are counted.

    Args:
        tupian_url: Iterable of absolute image URLs.
        path: Target directory; a trailing separator is not required.
    """
    makeFolder(path)
    count = 0
    for img_url in tupian_url:
        # Everything after the last "/" of the URL is used as the file name.
        img_name = img_url.split("/")[-1]
        # Fetch before opening the file so that a failed request never
        # leaves an empty file behind.
        response = askUrl(img_url)
        if response.ok:
            # os.path.join inserts the separator when *path* lacks one;
            # plain concatenation would write "<path><name>" next to the
            # directory instead of inside it.
            with open(os.path.join(path, img_name), mode="wb") as f:
                f.write(response.content)
            count = count + 1
        else:
            print("下载失败(HTTP %d),已跳过:%s" % (response.status_code, img_url))
        # Pause one second between requests so the crawl is less likely to
        # be noticed by the server.
        time.sleep(1)
    print("下载数量:%d" % count)
def makeFolder(path):
    """Create the directory *path* (with parents) unless it already exists.

    Prints one message saying whether the directory was created or was
    already present.

    Args:
        path: Directory to create.

    Raises:
        OSError: If creation fails and *path* still does not exist.
    """
    # Try the creation directly instead of testing os.path.exists first:
    # with check-then-create, the directory can appear between the two
    # steps and make os.makedirs raise.
    try:
        os.makedirs(path)
    except OSError:
        # Any failure is only an error if the path is really missing.
        if not os.path.exists(path):
            raise
        print("目录已经存在!")
    else:
        print("目录创建成功!")
if __name__ == '__main__':
    # Script entry point: print the timestamp before and after the crawl,
    # then the elapsed time in seconds.
    started_at = time.time()
    print("下载开始……,开始时间:%f" % started_at)
    main()
    finished_at = time.time()
    print("下载完成,结束时间 %f" % finished_at)
    print(finished_at - started_at)
|