Scraping the railway column of the Shunwang commentary site (舜网时评) and saving the articles to a CSV file
I'm a Python hobbyist and complete beginner. For work I occasionally submit pieces to the railway column of the Shunwang commentary site (http://opinion.e23.cn/index.php?m=content&c=index&a=lists&catid=25). One day it occurred to me: why not crawl every article in that column into a database, so I could analyse it further and get a feel for what is trending in commentary writing? So I dug out Python, which I had not touched in a long time, and built a crawler bit by bit. Since this was my first crawler, I went from easy to hard: first scrape the date, author, title and article link, and leave the article body for version 2.0.
Shunwang crawler 1.0:
import requests
from bs4 import BeautifulSoup
import lxml
import time
import csv

# current date
date = time.strftime("%Y-%m-%d", time.localtime())
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'}
# create the csv file with date, author, title, link and contents columns
# (the contents column stays empty for now and is filled in later)
f = open('opinions.csv', 'w', newline='', encoding='utf-8')
fwriter = csv.writer(f)
fwriter.writerow(['date', 'author', 'title', 'link', 'contents'])
f.close()
f = open('opinions.csv', 'a', newline='', encoding='utf-8')
fwriter = csv.writer(f)
# start from page 1
pages = 1
while pages < 2500:
    # build the list-page request for the current page
    param = {'m': 'content', 'c': 'index', 'a': 'lists', 'catid': 25, 'page': pages}
    r = requests.get('http://opinion.e23.cn/index.php', params=param, headers=header)
    soup = BeautifulSoup(r.text, 'lxml')
    articles = soup.find_all('div', 'CrBox03ConTopTxt')
    for titles in articles:
        # the <span> holds the date and the author, separated by spaces
        span_text = titles.span.get_text().strip()
        fdate = span_text.split(' ')[0]
        fauthor = span_text.split(' ')[-1]
        ftitle = titles.a.get_text().strip()
        flink = titles.a.get('href')
        try:
            fwriter.writerow([fdate, fauthor, ftitle, flink])
        except Exception as e:
            print('Error')
    pages += 1
f.close()
With the first step done, I turned to scraping the article body, starting with a small test script that grabs a single article:
import requests
from bs4 import BeautifulSoup

flink = 'http://opinion.e23.cn/a/2020-05-28/177798'
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'}
r = requests.get(flink, headers=header)
soup = BeautifulSoup(r.text, 'lxml')
# the article body sits inside <div class="box_con">
content = soup.find('div', 'box_con')
print(content.text.strip())
Once that was worked out, the next step was to add an article-body column to the CSV produced by version 1.0, which led to this small script that fills in the bodies in bulk:
import requests
from bs4 import BeautifulSoup
import pandas as pd

data = pd.read_csv('opinions.csv', index_col=False)
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'}
for i, link in enumerate(data['link']):
    #print(link)
    try:
        r = requests.get(link, headers=header)
        soup = BeautifulSoup(r.text, 'lxml')
        article = soup.find('div', 'box_con')
        # write the body into this row only; assigning data['contents'] directly
        # would overwrite the whole column with the same article every time
        data.loc[i, 'contents'] = article.text.strip()
    except Exception as e:
        pass
data.to_csv('opinions_with_contents.csv', index=False, encoding='utf-8')
To settle the matter once and for all, the next step was version 2.0 of the program: scrape everything in one pass and write it to the file.
import requests
from bs4 import BeautifulSoup
import lxml
import time
import csv
from pandas import read_csv
import os

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'}
f = open('opinions_with_contents.csv', 'a', newline='', encoding='utf-8')
fwriter = csv.writer(f)
# start from page 1
pages = 1
while pages < 8:
    # build the list-page request
    param = {'m': 'content', 'c': 'index', 'a': 'lists', 'catid': 25, 'page': pages}
    r = requests.get('http://opinion.e23.cn/index.php', params=param, headers=header)
    soup = BeautifulSoup(r.text, 'lxml')
    articles = soup.find_all('div', 'CrBox03ConTopTxt')
    for titles in articles:
        try:
            # get the article date, author, title and link
            span_text = titles.span.get_text().strip()
            fdate = span_text.split(' ')[0]
            fauthor = span_text.split(' ')[-1]
            ftitle = titles.a.get_text().strip()
            flink = titles.a.get('href')
            # fetch the article body through the link
            linkr = requests.get(flink, headers=header)
            linksoup = BeautifulSoup(linkr.text, 'lxml')
            content = linksoup.find('div', 'box_con').text.strip()
            # write one complete row to the file
            fwriter.writerow([fdate, fauthor, ftitle, flink, content])
        except Exception as e:
            print('Error:' + flink)
    pages += 1
f.close()
# drop duplicate rows and write them to a new file
df = read_csv('opinions_with_contents.csv')
newDF = df.drop_duplicates()
newDF.to_csv('opinions_with_contents2.csv', encoding='utf8', index=False)
# replace the old file with the deduplicated one
os.remove('opinions_with_contents.csv')
os.rename('opinions_with_contents2.csv', 'opinions_with_contents.csv')
As for version 3.0, the CSV file could be the basis for some language analysis with existing packages, or even for using the GPT-chinese project to write draft submissions automatically.
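As a rough illustration of what that analysis might look like, here is a minimal sketch that counts word frequencies over the scraped article bodies. It assumes the opinions_with_contents.csv produced above and picks the third-party jieba package for Chinese word segmentation; that package choice and the simple token filter are my own assumptions, not part of the original write-up.

import pandas as pd
import jieba
from collections import Counter

# load the articles scraped by the 2.0 crawler
df = pd.read_csv('opinions_with_contents.csv')

# count word frequencies across all article bodies
counter = Counter()
for text in df['contents'].dropna():
    # segment the body and ignore single-character tokens (mostly punctuation and particles)
    words = [w for w in jieba.cut(str(text)) if len(w) > 1]
    counter.update(words)

# print the 20 most frequent words as a rough view of trending topics
for word, freq in counter.most_common(20):
    print(word, freq)

The same CSV could just as easily feed a topic model, or serve as training material for the auto-writing idea mentioned above.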
This is my first post as a complete beginner, just a small hobby project; pointers from the experts here are very welcome.
Replies:
Nice work from a fellow Shandong native, keep it up!
Thanks for sharing. Good approach, I'll see whether it runs successfully.
hshcompass (2020-6-14 16:22): Got it running. I'll try a different site later and see whether it works.
Reply to hshcompass: Switch to a different site and some of the details will need adjusting.