本帖最后由 Hswyc 于 2022-3-28 23:02 编辑
最近浏览论坛,看到了帖子:https://www.52pojie.cn/thread-1404328-1-1.html
帖子上爬取的是https://www.vmgirls.com/这个网站的图片
最近在学习爬虫,准备拿这个练练手
https://www.vmgirls.com/archives.html 这里有所有文章链接的入口,所以就从这里出发了
源代码:一共是两种,一个用到了异步协程,下载会快一点
[Python] 纯文本查看 复制代码
import requests
import re
import os
import time
# Request headers: a desktop-browser User-Agent string passed to the HTTP calls below.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.52 '
    ),
}
# Collect the link of every article listed on the archive page.
def get_url(url):
    """Fetch the archive page and return the full URL of every article link on it.

    Args:
        url: Address of the archive page to download.

    Returns:
        A list of strings in page order, each being
        ``'https://www.vmgirls.com/'`` followed by one captured ``href`` value.
    """
    # Original author's note: the markup in resp.text differs from what the
    # browser dev tools (F12) show, so the pattern targets the raw response
    # text (unquoted attribute values).
    pattern = r'<a target=_blank style="color:.*?" href=(.*?)>'
    url_head = 'https://www.vmgirls.com/'
    # requests applies no timeout by default, so a stalled server would hang
    # the script forever; the context manager closes the response even when
    # reading or parsing raises.
    with requests.get(url, headers=headers, timeout=30) as resp:
        hrefs = re.findall(pattern, resp.text)
    # Complete each captured link with the site prefix.
    return [url_head + href for href in hrefs]
# Parse one article page.
def parse_html(arch_url):
    """Download an article page and extract its title and image URLs.

    Prints the article URL together with the HTTP status code of the response.

    Args:
        arch_url: URL of the article page.

    Returns:
        A ``(title, image_urls)`` tuple: the first ``<h1>`` capture, and a list
        with every captured link target prefixed by ``'https:'`` (the captured
        values are presumably protocol-relative ``//host/...`` links).

    Raises:
        IndexError: If the page contains no matching ``<h1>`` title.
    """
    # Timeout avoids hanging forever on a stalled connection; the context
    # manager releases the connection even if reading the body raises.
    with requests.get(arch_url, headers=headers, timeout=30) as resp:
        print(arch_url + ' ' + str(resp.status_code))
        html = resp.text
    title_pattern = r'<h1 class=.*?>(.*)</h1>'
    img_pattern = r'<a href="(.*?)" alt=.*?</a>'
    titles = re.findall(title_pattern, html)
    img_paths = re.findall(img_pattern, html)
    if not titles:
        # Same exception type the bare ``titles[0]`` would raise, but naming
        # the page that failed to parse.
        raise IndexError(f'no <h1> title found on {arch_url}')
    url_head = 'https:'
    return titles[0], [url_head + path for path in img_paths]
# Download one image.
def download_img(img_url, title):
    """Download one image and store it under ``../data/meimei/<title>/``.

    Args:
        img_url: URL of the image; the text after its last ``/`` becomes the
            file name.
        title: Article title, used as the name of the target directory.
    """
    # Timeout avoids hanging forever on a stalled connection; the context
    # manager closes the response even if reading the body raises.
    with requests.get(img_url, headers=headers, timeout=30) as resp:
        content = resp.content
    dir_name = title
    img_name = img_url.split('/')[-1]
    # makedirs(exist_ok=True) also creates the missing ``../data/meimei``
    # parents on a first run (os.mkdir would raise FileNotFoundError) and
    # removes the check-then-create race.
    os.makedirs(f'../data/meimei/{dir_name}', exist_ok=True)
    print(dir_name + '-->' + img_name + '正在下载...')
    with open(f'../data/meimei/{dir_name}/{img_name}', 'wb') as f:
        f.write(content)
    print(dir_name + '-->' + img_name + '下载完成!')
# Entry point of the synchronous crawler.
def main():
    """Ask how many articles to process and download their images in order.

    The article list is fetched from the archive page first, then the user is
    prompted for a count. Iteration stops right before the article whose
    1-based position equals ``count + 1``.
    """
    archive_page = 'https://www.vmgirls.com/archives.html'
    article_links = get_url(archive_page)
    wanted = int(input('输入要下载多少篇文章的图片:'))
    for position, link in enumerate(article_links, start=1):
        if position == wanted + 1:
            break
        article_title, picture_links = parse_html(link)
        for picture in picture_links:
            download_img(picture, article_title)
if __name__ == '__main__':
    # perf_counter() is a monotonic high-resolution clock, so the measured
    # duration is not affected by system clock adjustments the way a
    # time.time() difference can be.
    start = time.perf_counter()
    main()
    end = time.perf_counter()
    print(f'下载完成,用时: {end - start}s')
[Python] 纯文本查看 复制代码
import asyncio
import random
import aiofile
import aiohttp
import requests
import re
import os
import time
# Request headers: a desktop-browser User-Agent string passed to the HTTP calls below.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.52 '
    ),
}
# Collect the link of every article listed on the archive page.
def get_url(url):
    """Fetch the archive page and return the full URL of every article link on it.

    This is a blocking ``requests`` call.

    Args:
        url: Address of the archive page to download.

    Returns:
        A list of strings in page order, each being
        ``'https://www.vmgirls.com/'`` followed by one captured ``href`` value.
    """
    # Original author's note: the markup in resp.text differs from what the
    # browser dev tools (F12) show, so the pattern targets the raw response
    # text (unquoted attribute values).
    pattern = r'<a target=_blank style="color:.*?" href=(.*?)>'
    url_head = 'https://www.vmgirls.com/'
    # requests applies no timeout by default, so a stalled server would hang
    # the script forever; the context manager closes the response even when
    # reading or parsing raises.
    with requests.get(url, headers=headers, timeout=30) as resp:
        arch_urls = re.findall(pattern, resp.text)
    # Complete each captured link with the site prefix.
    return [url_head + href for href in arch_urls]
# Parse one article page and download every image found on it.
async def download_img(arch_url):
    """Fetch an article page, then download and save all of its images.

    Images are written to ``../data/meimei/<article title>/<file name>``, where
    the file name is the text after the last ``/`` of the image URL. Progress
    is printed before and after each file is written.

    Args:
        arch_url: URL of the article page.

    Raises:
        IndexError: If the page has image links but no matching ``<h1>`` title.
    """
    # Random pause of 1-2 seconds. asyncio.sleep suspends only this coroutine;
    # time.sleep would block the whole event loop and stall every other task.
    await asyncio.sleep(random.randint(1, 2))
    async with aiohttp.ClientSession() as session:
        async with session.get(arch_url, headers=headers) as resp:
            html = await resp.text()
        title_pattern = r'<h1 class=.*?>(.*)</h1>'
        img_pattern = r'<a href="(.*?)" alt=.*?</a>'
        arch_title = re.findall(title_pattern, html)
        img_url_lst = re.findall(img_pattern, html)
        if not img_url_lst:
            # Nothing to download: no directory is created and the title is
            # not needed.
            return
        dir_name = arch_title[0]
        dir_path = f'../data/meimei/{dir_name}'
        # makedirs(exist_ok=True) also creates the missing ``../data/meimei``
        # parents (os.mkdir would raise FileNotFoundError) and tolerates the
        # directory already existing.
        os.makedirs(dir_path, exist_ok=True)
        url_head = 'https:'
        for one_url in img_url_lst:
            # Complete the image link with the scheme.
            new_url = url_head + one_url
            async with session.get(new_url, headers=headers) as resp_2:
                content = await resp_2.content.read()
            img_name = new_url.split('/')[-1]
            print(dir_name + '-->' + img_name + '正在下载...')
            async with aiofile.async_open(f'{dir_path}/{img_name}', 'wb') as f:
                await f.write(content)
            print(dir_name + '-->' + img_name + '下载完成!')
# Main coroutine of the asynchronous crawler.
async def main():
    """Ask how many articles to process and download them concurrently.

    The article list is fetched from the archive page first, then the user is
    prompted for a count. One ``download_img`` task is scheduled per selected
    article and the coroutine waits until all of them have finished.
    """
    url = 'https://www.vmgirls.com/archives.html'
    arch_urls = get_url(url)
    num = int(input('输入要下载多少篇文章的图片:'))
    tasks = []
    for n, arch_url in enumerate(arch_urls, start=1):
        # Stop right before the article at 1-based position num + 1.
        if n == num + 1:
            break
        tasks.append(asyncio.ensure_future(download_img(arch_url)))
    # asyncio.wait raises ValueError on an empty collection, which happens
    # when the user enters 0 or no article links were found.
    if tasks:
        await asyncio.wait(tasks)
if __name__ == '__main__':
    # perf_counter() is a monotonic clock, so the measured duration is not
    # affected by system clock adjustments.
    start = time.perf_counter()
    # asyncio.run() creates a fresh event loop, runs main() to completion and
    # closes the loop afterwards; calling asyncio.get_event_loop() with no
    # running loop is deprecated in recent Python versions.
    asyncio.run(main())
    end = time.perf_counter()
    print(f'下载完成,用时: {end - start}s')
速度对比,都是下载9篇文章的图片
|