本帖最后由 ai酸的博文 于 2020-2-13 15:25 编辑
本文仅限于学习交流
一、实现思路
1、爬取中国网（http://www.china.com.cn/）首页上的所有 URL 链接，并从中筛选出新闻网页的 URL
2、对筛选后的url进行遍历获取新闻标题、内容
二、源码
import requests
import re
import difflib
from bs4 import BeautifulSoup

# Target site: china.com.cn front page (the comment in the original said
# "中国新闻网" but the URL below is actually china.com.cn).
URL = 'http://www.china.com.cn/'
# Plain desktop-Chrome UA so the server returns the normal HTML page.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}
#-------------爬取网页源代码-------------
def _getHtml(url, timeout=10):
    """Fetch *url* and return the body decoded as UTF-8 text.

    Raises on network failures, HTTP transport errors, or a body that is
    not valid UTF-8; callers should go through getHtml(), which converts
    any failure into None.

    timeout is new (default 10 s) so a dead host cannot hang the crawl
    forever; existing callers are unaffected.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # NOTE(review): strict UTF-8 decode — assumes the site always serves
    # UTF-8; response.text would honour declared encodings instead.
    return response.content.decode()
def getHtml(url):
    """Best-effort fetch: return the page HTML, or None on any failure."""
    try:
        return _getHtml(url=url)
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still
        # propagate; network errors and decode errors become None.
        return None
#-------------获取筛选后的新闻网页-------------
def getUrl_all(URL):
    """Collect candidate news-article URLs from the front page.

    Harvests every <a href> value on the page, then keeps only links
    whose text is similar (difflib ratio >= 0.8) to a known article URL.
    Returns a list of URL strings; empty if the front page is unreachable.
    """
    china_html = getHtml(url=URL)
    if china_html is None:
        # Front page could not be fetched — without this guard
        # re.findall(None) would raise TypeError.
        return []
    # Very rough harvest of every anchor href on the page.
    urlList_old = re.findall('<a href="(.*?)"', china_html)
    # Filter against the shape of one real article URL so only links that
    # plausibly point at news content survive (at most 300 matches).
    modelUrl = 'http://news.china.com.cn/2020-02/12/content_75698735.htm'
    urlList_new = difflib.get_close_matches(modelUrl, urlList_old, 300, cutoff=0.8)
    return urlList_new
#-------------爬取urlList_new的标题、内容-------------
def getInformation(urlList):
    """Fetch each URL in *urlList*, print its news title and body text.

    Best-effort: any page that fails to download or parse is skipped.
    Prints a running count of successfully scraped articles.
    """
    num = 0
    for url in urlList:
        try:
            url_html = getHtml(url=url)
            # getHtml may return None; BeautifulSoup(None) raises and the
            # except below skips this URL, matching the original behaviour.
            soup = BeautifulSoup(url_html, 'lxml')
            # [0] raises IndexError when the page has no title element,
            # which also routes through the skip path.
            article = soup.find_all(class_='articleTitle')[0].get_text()
            print('新闻标题:', article)
            content_old = soup.find_all(attrs={'style': 'text-indent: 2em; margin-bottom: 15px;'})
            # join instead of repeated += (linear, not quadratic).
            content = ''.join(i.get_text().strip() for i in content_old)
            if content == '':
                # Title matched but no body paragraphs — don't count it.
                continue
            print('新闻内容:\n', content)
            num = num + 1
            print('爬取成功', num, '条...')
        except Exception:
            # Deliberate best-effort crawl: one bad page must not abort
            # the whole run. Narrowed from a bare `except:` so Ctrl-C
            # still interrupts.
            pass
def run():
    """Entry point: harvest article links from the front page, then scrape each."""
    # Step 1: collect and filter candidate article URLs.
    candidates = getUrl_all(URL=URL)
    # Step 2: pull title and body text for every candidate.
    getInformation(urlList=candidates)


if __name__ == '__main__':
    run()
三、运行结果图片
对朋友有帮助的话记得免费评分喔~~