# A simple way to write a web crawler; recommended for beginners to study.
import requests
import re
import os
import time
# Create the output folder
def file_folder():
    """Create an empty ``彼岸花4k壁纸`` folder in the current working directory.

    If the folder already exists it is removed together with everything
    inside it and then recreated, so the folder is always empty when this
    function returns.
    """
    # Function-scope import keeps this block self-contained; the file's
    # top-level imports do not include shutil.
    import shutil

    # os.path.join instead of a hard-coded '\\': with the literal backslash
    # the folder only ends up inside the working directory on Windows.
    pathd = os.path.join(os.getcwd(), '彼岸花4k壁纸')
    if os.path.exists(pathd):
        # Remove the old folder and all of its contents in one call.
        shutil.rmtree(pathd)
    os.mkdir(pathd)
# Number used as the file name of the next saved image. It is module-level so
# that numbering continues across successive calls to data().
count_1 = 1


def data(url):
    """Download the wallpapers linked from one listing page.

    Fetches ``url``, follows every matched link (except the first match) to
    its detail page on ``https://pic.netbian.com``, extracts the image address
    from the ``photo-pic`` block and saves the image as ``<count_1>.jpg`` in
    the ``彼岸花4k壁纸`` folder under the current working directory.
    ``count_1`` is incremented after each saved image.

    Args:
        url: Address of the listing page to crawl.

    Raises:
        requests.exceptions.RequestException: If a request fails or does not
            answer within the timeout.
    """
    global count_1

    site = 'https://pic.netbian.com'
    # Without a timeout requests waits indefinitely on a stalled connection.
    timeout = 15
    # Built with os.path.join so the path is valid on every OS; created on
    # demand so this function does not depend on the folder already existing.
    folder = os.path.join(os.getcwd(), '彼岸花4k壁纸')
    os.makedirs(folder, exist_ok=True)

    listing = requests.get(url, timeout=timeout)
    listing.encoding = 'gbk'
    url_r = '<li><a href="(.*?)" target="_blank"><img.*?><b>.*?</b></a></li>'
    # The first match is discarded (presumably it is not a wallpaper entry --
    # TODO confirm). Slicing, unlike ``del lst[0]``, does not raise when the
    # page produced no matches at all, e.g. because the URL was mistyped.
    detail_paths = re.findall(url_r, listing.text)[1:]

    for detail_path in detail_paths:
        detail = requests.get(site + detail_path, timeout=timeout)
        detail.encoding = "gbk"
        r = '<div class="photo-pic"><a href="" id="img"><img src="(.*?)".*?></a></div>'
        image_paths = re.findall(r, detail.text)
        if not image_paths:
            # Detail page without a recognisable image block: skip it.
            continue

        image_url = site + image_paths[0]
        image = requests.get(image_url, timeout=timeout)
        if image.status_code != 200:
            # Do not store an error page under a .jpg name.
            print(f'图片下载失败(状态码{image.status_code}),已跳过:{image_url}')
            continue

        with open(os.path.join(folder, f"{count_1}.jpg"), "wb") as f:
            f.write(image.content)
        count_1 += 1
        # Pause between downloads to avoid hammering the site.
        time.sleep(1.5)
if __name__ == "__main__":
    # Ask for the listing-URL prefix (everything up to and including
    # "index_") and the range of page numbers to crawl.
    url = input('图片网站:https://pic.netbian.com\n文件保存在彼岸花4k壁纸文件夹里\n第一页不可以爬\n请输入爬取的链接\n(提示:网址链接写法\n如:https://pic.netbian.com/4kbeijing/index_斜杠后不要加后面内容):').strip()
    # Convert the page numbers before touching the output folder, so a typo
    # raises ValueError without first deleting previously downloaded images.
    pagestart = int(input('请输入开始页数:'))
    pagestop = int(input('请输入结束页数:'))
    # Start from an empty output folder.
    file_folder()
    # +1 because range() excludes its upper bound, while the prompt asks for
    # the end page, which the user expects to be crawled as well.
    for i in range(pagestart, pagestop + 1):
        st = url + str(i) + '.html'
        print(f'开始爬取第{i}页')
        data(st)
        print('第', i, '页爬取完毕,如果文件夹不加载图片说明网址输入错误')