Scraping wallpapers with a Python crawler
Last edited by szsnk144864 on 2022-5-6 23:13.
I don't have much of a Python background. I watched a bit of a crawler tutorial and got a lot of help when I asked questions here on 52, thanks.
I wrote a small crawler that scrapes a wallpaper site. It crawls and downloads fine, but it's far from perfect, so I'd appreciate pointers from the experts. Thanks.
import requests
from lxml import etree

url = "https://www.mmonly.cc/gqbz/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
}
# Get the response for the front list page
resp = requests.get(url=url, headers=headers)
resp.encoding = 'gbk'
tree1 = etree.HTML(resp.text)
# Grab the pager links; using the unique id makes the xpath simpler
all_img_end_url = url + tree1.xpath('//*[@id="pageNum"]/a/@href')[-1]  # build the last pager address by concatenation
response1 = requests.get(url=all_img_end_url, headers=headers)
response1.encoding = 'gbk'
tree2 = etree.HTML(response1.text)
get_end_num = tree2.xpath('//*[@id="pageNum"]/a/text()')[-2]  # total number of list pages, read from the pager
# Loop 1: walk through every list page
for s1 in range(1, int(get_end_num) + 1):
    if s1 == 1:  # the first list page has no number suffix in its URL
        all_top_inside_url = 'https://www.mmonly.cc/gqbz/'
    else:
        all_top_inside_url = f'https://www.mmonly.cc/gqbz/list_41_{s1}.html'
    # Fetch the list page and hand it to xpath
    response2 = requests.get(url=all_top_inside_url, headers=headers)
    response2.encoding = 'gbk'
    tree3 = etree.HTML(response2.text)
    u_img_dizhi = tree3.xpath('//*[@id="infinite_scroll"]/div/div/div/div/a/@href')  # detail-page links on this list page
    # Loop 2: enter every detail page
    for a in u_img_dizhi:  # a is the address of one detail page
        response3 = requests.get(url=a, headers=headers)
        response3.encoding = 'gbk'
        tree4 = etree.HTML(response3.text)
        # Total number of pages in this gallery
        all_inside_apge = tree4.xpath('//*[@id="picnum"]/span/text()')[-1]
        # Which category the gallery belongs to
        get_fenlei = tree4.xpath('/html/body/div/div/div/a/text()')[-1]
        # Short id used in the follow-up page URLs, e.g. ".../dmbz/123456.html" -> "123456"
        get_DuanLianJie = a.split('/')[-1].split('.')[0]
        # Gallery title
        get_name = tree4.xpath('/html/body/div/div/div/h1/text()')
        print('Scraping: ' + ''.join(get_name) + ', ' + ''.join(all_inside_apge) + ' pages in total')
        # Map the category name to the path segment used in the URL
        if get_fenlei == '文字壁纸':
            photo_fenlei = 'wzbz'
        if get_fenlei == '动态壁纸':
            photo_fenlei = 'dtbz'
        if get_fenlei == '美女壁纸':
            photo_fenlei = 'mnbz'
        if get_fenlei == '风景壁纸':
            photo_fenlei = 'fjbz'
        if get_fenlei == '动漫壁纸':
            photo_fenlei = 'dmbz'
        if get_fenlei == '唯美壁纸':
            photo_fenlei = 'wmbz'
        if get_fenlei == '纯色壁纸':
            photo_fenlei = 'csbz'
        # Loop 3: build the address of every page in the gallery
        all_url = []
        for s in range(1, int(all_inside_apge) + 1):  # range() stops one short, so add 1
            if s == 1:  # the first gallery page has no "_n" suffix
                all_inside_url = 'https://www.mmonly.cc/gqbz/' + photo_fenlei + '/' + get_DuanLianJie + '.html'
            else:
                all_inside_url = 'https://www.mmonly.cc/gqbz/' + photo_fenlei + '/' + get_DuanLianJie + '_' + str(s) + '.html'
            all_url.append(all_inside_url)
        # Loop 4: fetch every gallery page and pull out the image itself
        for s, k in enumerate(all_url, start=1):  # enumerate keeps the page number for the file name
            response4 = requests.get(url=k, headers=headers)
            response4.encoding = 'gbk'
            tree5 = etree.HTML(response4.text)
            # xpath returns a list; join() turns it into a plain string
            all_inside_img = ''.join(tree5.xpath('//*[@id="big-pic"]/p//img/@src'))
            # I got stuck here and wasn't sure how to number the files...
            print('Saving page: ' + k)
            with open('d:/test/' + get_DuanLianJie + '_' + str(s) + '.jpg', 'wb') as f:
                f.write(requests.get(all_inside_img).content)  # download the image bytes and write them to disk (copied from another example)
            # print(get_name + ' finished downloading')
            # I'm not sure whether all these response.close() calls are needed; I'm just afraid the site will ban my IP...
            response4.close()  # response from loop 4
        response3.close()  # response from loop 3
    response2.close()  # response from loop 2
response1.close()  # response from loop 1
resp.close()  # the very first response
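As an aside on the repeated response.close() calls and the chain of category ifs above, here is a minimal sketch (not from the original post) of how a requests.Session used as a context manager closes itself when the block exits, and how a dict lookup can replace the if-chain:

import requests

# Minimal sketch (not from the original post): the Session is closed automatically
# when the with-block exits, and a dict lookup replaces the seven category ifs.
FENLEI_MAP = {
    '文字壁纸': 'wzbz', '动态壁纸': 'dtbz', '美女壁纸': 'mnbz', '风景壁纸': 'fjbz',
    '动漫壁纸': 'dmbz', '唯美壁纸': 'wmbz', '纯色壁纸': 'csbz',
}

with requests.Session() as session:
    session.headers['User-Agent'] = 'Mozilla/5.0'
    resp = session.get('https://www.mmonly.cc/gqbz/')
    resp.encoding = 'gbk'
    photo_fenlei = FENLEI_MAP.get('动漫壁纸', 'gqbz')  # lookup instead of an if per category
    print(resp.status_code, photo_fenlei)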
Last edited by 话痨司机啊 on 2022-5-9 15:48.
The code below was untested at first and turned out to have a few problems once I tested it; [see here] https://www.52pojie.cn/thread-1633562-1-1.html (now tested, nothing major, I tweaked it a little; pointers are welcome, I still think parts of it are a bit rough).
# //div[@class="item_list infinite_scroll masonry"]/div//a/img/@alt  wallpaper title
# //div[@class="item_list infinite_scroll masonry"]/div//a/img/@src  wallpaper thumbnail address
# //div[@class="item_list infinite_scroll masonry"]/div//a/@href     wallpaper detail-page address
# 440 list pages in total: https://www.mmonly.cc/gqbz/list_41_{page_num}.html with page_num from 1 to 440
# //div[@class="topmbx"]/a/text()  wallpaper category
# //div[@id="big-pic"]//a/@href    high-resolution wallpaper address
import aiofiles
import aiohttp
import asyncio
import async_timeout
from collections import namedtuple
import os
import time
from rich.console import Console
from fake_useragent import UserAgent
from lxml import etree
from typing import List, Text

console = Console()
headers = {'User-Agent': UserAgent().random}  # headers must be a mapping, not a bare UA string
Img_url_name = namedtuple('Img_url_name', ['img_url', 'img_name'])
Img_big_url_type = namedtuple('Img_big_url_type', ['img_big_url', 'img_type'])


async def get_html(url) -> Text:
    """
    Fetch the page source.
    """
    async with aiohttp.ClientSession() as session:
        async with async_timeout.timeout(10):
            async with session.get(url, headers=headers) as resp:
                return await resp.text()


async def save_img(img_url, img_name) -> None:
    """
    Download one image and write it to disk.
    """
    async with aiohttp.ClientSession() as session:
        async with async_timeout.timeout(10):
            async with session.get(img_url, headers=headers) as resp:
                img = await resp.read()
                async with aiofiles.open(img_name, 'wb') as f:
                    await f.write(img)
                    console.print(f'{img_name} downloaded!')


def get_img_url_name(resp_text) -> List:
    """
    Get the detail-page address and title of every thumbnail on a list page.
    """
    tree = etree.HTML(resp_text)
    # Each list page holds 24 thumbnails with their detail-page links and titles
    img_url_name = [Img_url_name(
        img_url=''.join(tree.xpath(f'//div[@class="item_list infinite_scroll masonry"]/div[{num}]//a/@href')),
        img_name=''.join(tree.xpath(f'//div[@class="item_list infinite_scroll masonry"]/div[{num}]//a/img/@alt')))
        for num in range(1, 25)]
    return img_url_name


def get_big_img(resp_text) -> Img_big_url_type:
    """
    Get the high-resolution image address and the category from a detail page.
    """
    tree = etree.HTML(resp_text)
    img_big_url_type = Img_big_url_type(
        img_big_url=''.join(tree.xpath('//div[@id="big-pic"]//a/@href')),
        img_type=tree.xpath('//div[@class="topmbx"]/a/text()')[-1])
    return img_big_url_type


def mkdir(path) -> bool:
    """
    Create the folder if it does not exist yet.
    """
    if not os.path.exists(path):
        os.makedirs(path)
    return True


async def main():
    """
    Main entry point.
    """
    start_time = time.time()
    # Note: the semaphore is acquired once around a sequential loop here, so it does
    # not actually throttle anything; it only limits concurrency when shared by tasks.
    async with asyncio.Semaphore(5):
        for num in range(1, 441):
            # 440 list pages in total
            url = f'https://www.mmonly.cc/gqbz/list_41_{num}.html'
            resp_text = await get_html(url)
            get_img_url_name_list = get_img_url_name(resp_text)
            for img_url_name in get_img_url_name_list:
                if not img_url_name.img_url:  # skip empty slots when a page has fewer than 24 items
                    continue
                resp_text_big_img = await get_html(img_url_name.img_url)
                # image counter starts at 1
                img_num = 1
                # gallery page counter starts at 1
                page_num = 1
                while True:
                    try:
                        if page_num >= 2:
                            resp_text_big_img = await get_html(next_img_big_url)
                        img_big_url_type = get_big_img(resp_text_big_img)
                        save_dir = os.path.join(img_big_url_type.img_type, img_url_name.img_name)
                        if mkdir(save_dir):
                            await save_img(img_big_url_type.img_big_url,
                                           os.path.join(save_dir, f'{img_url_name.img_name}_{img_num}.jpg'))
                            img_num += 1
                            await asyncio.sleep(1)
                        page_num += 1
                        # follow-up gallery pages use the pattern <id>_<page_num>.html
                        next_img_big_url = img_url_name.img_url.rsplit('.', 1)[0] + '_' + str(page_num) + '.html'
                    except Exception:
                        img_num = 1
                        break
    console.print(f'Download finished! Took {time.time() - start_time} seconds')


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
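For reference, an asyncio.Semaphore only limits work when each coroutine acquires it and the coroutines actually run concurrently. A rough sketch of that pattern (not from the thread; the fetch() below is a hypothetical stand-in for get_html/save_img):

import asyncio

async def fetch(sem: asyncio.Semaphore, url: str) -> str:
    # hypothetical stand-in for get_html()/save_img(); the semaphore is acquired
    # inside each task, so at most 5 tasks are inside this block at the same time
    async with sem:
        await asyncio.sleep(0.1)  # placeholder for the real aiohttp request
        return url

async def main():
    sem = asyncio.Semaphore(5)
    urls = [f'https://www.mmonly.cc/gqbz/list_41_{n}.html' for n in range(1, 11)]
    pages = await asyncio.gather(*(fetch(sem, u) for u in urls))  # run the tasks concurrently
    print(f'{len(pages)} pages fetched')

if __name__ == '__main__':
    asyncio.run(main())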
Leaving a bookmark.

话痨司机啊 posted on 2022-5-7 02:15:
# //div[@class="item_list infinite_scroll masonry"]/div//a/img/@alt...
Thanks for the pointers, I'll go study it. I've only just started learning crawlers and can't quite follow this yet.

tl;dr posted on 2022-5-7 06:11:
Bing's historical wallpapers
Oh, I didn't expect that; I just picked this site at random.

As long as it solves the problem, that's what matters.
Thanks for sharing, I'm learning Python.
Thanks for sharing. If you just change the URL, can it crawl other sites too?