I've been learning web scraping recently and wrote a simple demo that scrapes the text of the Hunan Daily (湖南日报) e-paper.
It only goes as far as fetching the content; if you need persistence, you can modify it yourself to save to a file or a database (see the sketch after the code).
I'm a newbie, so please go easy on me.
[Python]
import requests
from bs4 import BeautifulSoup

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
}

url_pre = 'https://hnrb.voc.com.cn/hnrb_epaper/html/2021-04/30/'  # change the date to scrape here
url = url_pre + 'node_201.htm'

# Fetch the issue's front page, which links to every section (版面) of that day's paper.
main_resp = requests.get(url, headers=headers)
main_resp.encoding = 'utf-8'
main_soup = BeautifulSoup(main_resp.text, 'html.parser')
banmian_links = main_soup.find('div', {'id': 'bmdh'}).find_all('a', {'id': 'pageLink'})

for banmian_link in banmian_links:
    # Fetch each section page, which lists the articles it contains.
    child_href = url_pre + banmian_link.get('href')
    child_resp = requests.get(child_href, headers=headers)
    child_resp.encoding = 'utf-8'
    child_soup = BeautifulSoup(child_resp.text, 'html.parser')
    content_links = child_soup.find('ul', {'class': 'ul02_l'}).find_all('a')
    for content_link in content_links:
        # Fetch each article and extract its title and body text.
        content_resp = requests.get(url_pre + content_link.get('href'), headers=headers)
        content_resp.encoding = 'utf-8'
        content_soup = BeautifulSoup(content_resp.text, 'html.parser')
        title = content_soup.find('td', {'class': 'font01'}).text
        content = content_soup.find('div', {'id': 'ozoom'}).text
        print(title)
        print(content)
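
For the persistence mentioned above, and for picking the date programmatically instead of editing the URL by hand, here is a minimal sketch. The build_url_pre and save_article helpers, the one-text-file-per-issue layout, and the assumption that the archive follows the .../html/YYYY-MM/DD/ pattern seen in the hard-coded URL are all my additions, not part of the original demo.

[Python]
from datetime import date

def build_url_pre(d):
    # Assumption: the e-paper archive follows the .../html/YYYY-MM/DD/ layout
    # visible in the hard-coded URL of the demo above.
    return d.strftime('https://hnrb.voc.com.cn/hnrb_epaper/html/%Y-%m/%d/')

def save_article(path, title, content):
    # Append one article (title, then body) to a UTF-8 text file.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(title.strip() + '\n')
        f.write(content.strip() + '\n\n')

# Usage: build the prefix once, then, inside the inner loop of the demo,
# replace the two print() calls with:
#     save_article('hnrb_2021-04-30.txt', title, content)
url_pre = build_url_pre(date(2021, 4, 30))

Appending per article keeps the demo simple; switching to a database would only mean swapping out save_article.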