Scraping Eastmoney (东方财富) forum posts: the 上证指数吧 (Shanghai Composite Index board) as an example
Overview
I have recently been learning text sentiment analysis and have been looking at various forums and message boards as candidate scraping targets for case studies. This time the target is Eastmoney's stock forum (股吧), specifically the Shanghai Composite Index board.
Code
import requests
from lxml import etree
from time import sleep
import random
from fake_useragent import UserAgent

ua = UserAgent()

# Iterate over the board's list pages: list,zssh000001_1.html ... list,zssh000001_4710.html
for p in range(4710):
    try:
        url = f'http://guba.eastmoney.com/list,zssh000001_{p + 1}.html'
        # Rotate the User-Agent on every request to reduce the chance of being blocked
        headers = {'User-Agent': ua.random}
        page = requests.get(url=url, headers=headers).content.decode('utf-8')
        tree = etree.HTML(page)
        # Slice off the first row and the last two rows, which are not post entries
        for li in tree.xpath('//div[@id="articlelistnew"]/div')[1:-2]:
            try:
                popper = li.xpath('./span[@class="l4 a4"]/a/@data-popper')[0]
                # Skip posts from a few specific accounts, identified by their data-popper id
                if popper not in ['3006113720930996', '7428111481466798', '6712111507146464', '6255325874333310']:
                    read = li.xpath('./span[@class="l1 a1"]/text()')[0]     # read count
                    com = li.xpath('./span[@class="l2 a2"]/text()')[0]      # comment count
                    title = li.xpath('./span[@class="l3 a3"]/a/text()')[0]  # post title
                    date = li.xpath('./span[@class="l5 a5"]/text()')[0]     # post time
                    row = f'{read},{com},{title},{date}'
                    print(row)
                    # Append to the CSV; utf-8-sig lets Excel open the Chinese text correctly
                    with open('上证指数吧.csv', 'a', encoding='utf-8-sig') as f:
                        f.write(row + '\n')
            except Exception as e:
                print(str(e))
    except Exception as e:
        print(str(e))
    finally:
        # Log the page index and pause briefly between pages
        print(str(p))
        sleep(0.1 + random.random() / 10)
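Note that the rows are joined with plain commas, so any post title that itself contains a comma will shift the columns in 上证指数吧.csv. If that matters for the later analysis, writing the rows through Python's csv module keeps the fields properly quoted; the helper below is a minimal optional sketch, not part of the original script:

import csv

# Optional hardening (not in the original script): csv.writer quotes fields as needed,
# so commas inside post titles no longer break the column layout.
def append_row(read, com, title, date, path='上证指数吧.csv'):
    with open(path, 'a', encoding='utf-8-sig', newline='') as f:
        csv.writer(f).writerow([read, com, title, date])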
Run status
Run results
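After the run finishes, everything is in 上证指数吧.csv. As a starting point for the sentiment-analysis work mentioned in the overview, the file can be loaded back like this (a minimal sketch, assuming pandas is available; the column names are simply the four fields the crawler writes, since the file has no header row):

import pandas as pd

# Minimal sketch (assumes pandas 1.3+ for on_bad_lines): the CSV has no header row,
# so name the four fields written by the crawler explicitly and skip any malformed rows.
df = pd.read_csv('上证指数吧.csv',
                 names=['read', 'comments', 'title', 'date'],
                 encoding='utf-8-sig',
                 on_bad_lines='skip')
print(df.shape)
print(df.head())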