本教程使用 Python 语言,共分为 4 个部分:导入库、爬取网页源代码、解析网页源代码、保存数据到本地,简单几步即可完成网页数据爬取。
#导入requests库和re库
import requests
import re
# Step 1: fetch the page source of the Douban movie chart.
url = 'https://movie.douban.com/chart'
# Send a browser-like User-Agent header along with the request.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
}
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
html_str = response.text

# Step 2: parse the page source. Capture the value of the first title="..."
# attribute that follows each class="nbg" occurrence (non-greedy; "." does
# not cross line breaks because re.DOTALL is not set).
pattern = re.compile(r'class="nbg".*?title="(.*?)"')
results = pattern.findall(html_str)

# Step 3: save the captured titles to a local file, one per line.
with open('douban.txt', 'w', encoding='utf-8') as f:
    for r in results:
        f.write(r + '\n')