本人学习的是大气科学,最近在学天气学,需要分析中央气象台的天气图,所以做了个爬虫爬取。
(网站上的图片只保留最近几天,所以可以自己爬取下来留存)
示例
import os
import re
import requests
from bs4 import BeautifulSoup
# Build the list of product pages to scrape (`urls`) and, in lockstep, the
# prefix used for each page's output folder and file names (`names`).
# Every page URL has the form
#   http://www.nmc.cn/publish/observations/<region>/dm/<product>-h<level>.htm
# e.g. http://www.nmc.cn/publish/observations/asia/dm/weatherchart-h000.htm
l1 = ['china', 'asia', 'north']  # <region> path segments
# <level> codes placed after "-h"
# (presumably pressure levels in hPa, with 000 = surface -- TODO confirm)
l2 = ['000', '925', '850', '700', '500', '200', '100']
l3 = ['weatherchart', 'cloud']   # <product> names used for most region/level pairs
l4 = ['weatherchart', 'radar']   # not referenced by the loop below
l5 = ['weatherchart']            # not referenced by the loop below
urls = []
names = []

base = 'http://www.nmc.cn/publish/observations/'
for region in l1:
    for level in l2:
        if region == 'china' and level == '000':
            # Special case: the radar page is fetched here instead of a cloud
            # page, but it is still stored under the "cloud" prefix.
            pages = [
                ('radar', 'china000cloud'),
                ('weatherchart', 'china000weatherchart'),
            ]
        elif region == 'north':
            # Only weather charts are collected for this region; note that the
            # prefix puts the product before the level, unlike the other branch.
            pages = [('weatherchart', region + 'weatherchart' + level)]
        else:
            pages = [(product, region + level + product) for product in l3]

        for product, name in pages:
            urls.append(base + region + '/dm/' + product + '-h' + level + '.htm')
            names.append(name)
# Download every image advertised on each page in `urls` and save it as
#   D:/tianqitu/<name>/<name><timestamp>.jpg
# where <name> is the entry of `names` at the same position as the page URL.
# A page or image that cannot be fetched is reported and skipped so that one
# failure does not abort the remaining downloads.

# Request headers: replace the default user-agent with a browser-like one.
kv = {'user-agent': 'Mozilla/5.0'}
# Seconds to wait on each request; requests waits indefinitely without this.
timeout = 30

for page_url, name in zip(urls, names):
    try:
        page = requests.get(page_url, headers=kv, timeout=timeout)
    except requests.RequestException as err:
        print('failed to fetch ' + page_url + ': ' + str(err))
        continue
    # Decode the body with the encoding detected from its content.
    page.encoding = page.apparent_encoding
    soup = BeautifulSoup(page.text, "html.parser")

    # The image entries are read from the element with id="timeWrap".
    wraps = soup.find_all(id='timeWrap')
    if not wraps:
        print('no timeWrap element, skipped: ' + page_url)
        continue
    wrap_html = str(wraps[0])

    # Collect the data-img / data-time attribute values in document order and
    # pair them by position (the n-th image with the n-th timestamp).
    img_urls = re.findall(r'data-img="([^"]*)"', wrap_html)
    stamps = re.findall(r'data-time="([^"]*)"', wrap_html)

    # open() does not create missing folders, so make sure the target exists.
    out_dir = 'D:/tianqitu/' + name
    os.makedirs(out_dir, exist_ok=True)

    for img_url, stamp in zip(img_urls, stamps):
        # Make the timestamp usable in a file name: '/' and ' ' become '.',
        # and ':00' is dropped (a value like "08/12 08:00" gives "08.12.08";
        # the exact format served by the site is presumed -- TODO confirm).
        n = stamp.replace('/', '.').replace(' ', '.').replace(':00', '')
        path = out_dir + '/' + name + n + '.jpg'
        print(path)
        try:
            z = requests.get(img_url, headers=kv, timeout=timeout)
            # Do not write an HTTP error page to disk as if it were an image.
            z.raise_for_status()
        except requests.RequestException as err:
            print('failed to fetch ' + img_url + ': ' + str(err))
            continue
        # The with-statement closes the file; no explicit close() is needed.
        with open(path, 'wb') as f:
            f.write(z.content)
        print(name + n)