利用Python爬虫获取知乎热榜
# 利用Python爬虫获取知乎热榜

> 如今,知乎必须登录才能查看相关话题内容,给我们的日常使用造成了极大的不便。今天我就教大家如何利用简单的代码,绕开知乎的登录限制。
## 准备工作
1. 配置好 Python 运行环境(推荐使用 PyCharm),并安装依赖库:`pip install requests`。
2. 复制下面的源代码并运行,大功告成。
## 源代码
```python
import requests
class Zhihu:
    """Fetch Zhihu's hot list and recommendation feed and clean them up.

    Creating an instance immediately requests both endpoints, then stores the
    raw JSON (``hot``, ``recommend``) and the cleaned Markdown links
    (``hot_data``, ``recommend_data``) on the instance.
    """

    # Seconds to wait for each HTTP response. ``requests`` applies no timeout
    # by default, so without this a stalled connection blocks forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.hot_lists_api = 'https://api.zhihu.com/topstory/hot-lists/total'  # hot-list endpoint
        self.recommend_lists_api = 'https://api.zhihu.com/topstory/recommend'  # recommendation endpoint
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
        }
        self.hot = self.get_hot_lists()  # raw hot-list JSON
        self.recommend = self.get_recommend_lists()  # raw recommendation JSON
        self.hot_data = self.wash_hot_lists()  # cleaned hot list
        self.recommend_data = self.wash_recommend_lists()  # cleaned recommendations

    @staticmethod
    def _as_link(title, url):
        """Format a Markdown link, rewriting an API question URL to its web URL."""
        web_url = url.replace('api.zhihu.com/questions', 'zhihu.com/question')
        return f'[{title}]({web_url})'

    def _get_json(self, url, params):
        """GET *url* with the shared headers and return the decoded JSON body.

        Raises:
            requests.RequestException: On timeout, connection failure, or a
                non-2xx status (via ``raise_for_status``).
        """
        response = requests.get(
            url=url,
            headers=self.headers,
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail loudly on HTTP errors instead of parsing an error page as data.
        response.raise_for_status()
        return response.json()

    def get_hot_lists(self):
        """Request the hot list.

        Returns:
            The decoded JSON response.
        """
        params = {'limit': '10',
                  'is_browser_model': '0'}
        return self._get_json(self.hot_lists_api, params)

    def get_recommend_lists(self):
        """Request the recommendation feed.

        Returns:
            The decoded JSON response.
        """
        params = {
            "action": "down",
            "ad_interval": "-10",
            "after_id": '1',  # TODO: hard-coded; revisit
            "page_number": '1',  # TODO: hard-coded; revisit
            # NOTE(review): hard-coded token, presumably taken from a real
            # session; confirm whether it can expire.
            "session_token": "99872c210b53364be1ede4bf459e8005", }
        return self._get_json(self.recommend_lists_api, params)

    def wash_hot_lists(self):
        """Turn the raw hot-list payload into Markdown links.

        Returns:
            list[str]: One ``[title](url)`` string per entry of
            ``self.hot['data']``.

        Raises:
            KeyError: If the payload has no ``data`` key or an entry lacks
                ``target``, ``title`` or ``url``.
        """
        return [
            self._as_link(entry['target']['title'], entry['target']['url'])
            for entry in self.hot['data']
        ]

    def wash_recommend_lists(self):
        """Turn the raw recommendation payload into Markdown links.

        For each entry the nested ``question`` object is preferred; when it is
        absent or incomplete, the target's own ``title``/``url`` are used.
        Entries that offer neither are skipped, because the feed sometimes
        contains items with no title and URL at all and these used to abort
        the whole run with an uncaught ``KeyError``.

        Returns:
            list[str]: ``[title](url)`` strings for every usable entry.

        Raises:
            KeyError: If the payload has no ``data`` key.
        """
        links = []
        for entry in self.recommend['data']:
            target = entry.get('target')
            if not isinstance(target, dict):
                continue
            for source in (target.get('question'), target):
                if isinstance(source, dict) and 'title' in source and 'url' in source:
                    links.append(self._as_link(source['title'], source['url']))
                    break
        return links
# Instantiate the client; __init__ requests and cleans both feeds right away.
zhihu = Zhihu()
```
## 使用教程
- 要获取当前知乎热榜数据,在源代码末尾添加下面这行代码,然后运行程序即可。
```python
# Print the cleaned hot list: a list of "[title](url)" strings.
print(zhihu.hot_data)
```
- 要想获取随机推荐话题,在源代码末尾添加下面这行代码,然后运行程序即可。
```python
# Print the cleaned recommendations: a list of "[title](url)" strings.
print(zhihu.recommend_data)
```
## 运行结果展示
import requests
class Zhihu:
    """Fetch Zhihu's hot list and recommendation feed and clean them up.

    Creating an instance immediately requests both endpoints, then stores the
    raw JSON (``hot``, ``recommend``) and the cleaned Markdown links
    (``hot_data``, ``recommend_data``) on the instance.
    """

    # Seconds to wait for each HTTP response. ``requests`` applies no timeout
    # by default, so without this a stalled connection blocks forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.hot_lists_api = 'https://api.zhihu.com/topstory/hot-lists/total'  # hot-list endpoint
        self.recommend_lists_api = 'https://api.zhihu.com/topstory/recommend'  # recommendation endpoint
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
        }
        self.hot = self.get_hot_lists()  # raw hot-list JSON
        self.recommend = self.get_recommend_lists()  # raw recommendation JSON
        self.hot_data = self.wash_hot_lists()  # cleaned hot list
        self.recommend_data = self.wash_recommend_lists()  # cleaned recommendations

    def get_hot_lists(self):
        """Request the hot list.

        Returns:
            The decoded JSON response.
        """
        params = {'limit': '10', 'is_browser_model': '0'}
        response = requests.get(
            url=self.hot_lists_api,
            headers=self.headers,
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        return response.json()

    def get_recommend_lists(self):
        """Request the recommendation feed.

        Returns:
            The decoded JSON response.
        """
        params = {
            "action": "down",
            "ad_interval": "-10",
            "after_id": '1',  # TODO: set an appropriate value
            "page_number": '1',  # TODO: set an appropriate value
            "session_token": "99872c210b53364be1ede4bf459e8005"
        }
        response = requests.get(
            url=self.recommend_lists_api,
            headers=self.headers,
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        return response.json()

    def wash_hot_lists(self):
        """Turn the raw hot-list payload into Markdown links.

        Missing keys are tolerated: an absent ``data`` list yields an empty
        result, and an absent ``title`` or ``url`` is rendered as an empty
        string.

        Returns:
            list[str]: One ``[title](url)`` string per entry.
        """
        hot_lists = []
        for data in self.hot.get('data', []):
            target = data.get('target', {})
            title = target.get('title', '')
            url = target.get('url', '').replace('api.zhihu.com/questions', 'zhihu.com/question')
            hot_lists.append(f'[{title}]({url})')
        return hot_lists

    def wash_recommend_lists(self):
        """Turn the raw recommendation payload into Markdown links.

        For each entry the nested ``question`` object is preferred; when it is
        absent or has no title/URL, the target's own ``title``/``url`` are
        used. An explicit check is needed because ``dict.get`` never raises
        ``KeyError``, so a ``try/except KeyError`` fallback would never run.
        Entries that offer neither a title nor a URL are skipped instead of
        producing an empty ``[]()`` link.

        Returns:
            list[str]: ``[title](url)`` strings for every usable entry.
        """
        links = []
        for entry in self.recommend.get('data', []):
            target = entry.get('target', {})
            if not isinstance(target, dict):
                continue
            for source in (target.get('question'), target):
                if not isinstance(source, dict):
                    continue
                title = source.get('title')
                url = source.get('url')
                if title and isinstance(url, str) and url:
                    web_url = url.replace('api.zhihu.com/questions', 'zhihu.com/question')
                    links.append(f'[{title}]({web_url})')
                    break
        return links
# Create the client object; __init__ requests and cleans both feeds right away.
zhihu = Zhihu()
代码有一个偶发性的 bug,遇到时会报错:推荐数据里的条目不只有“含 question”和“不含 question”两种情况,还存在既没有 title 也没有 url 的情况。如下图:
有空多多交流{:1_893:} 先保存一下 万一下次用得着 感谢分享{:1_893:}收藏一下 感谢分享,学习下 学习了,做成类,方便调用,不错的写法。 xinxiu 发表于 2023-8-15 19:27
学习了,做成类,方便调用,不错的写法。
还有B站、微博、百度、抖音、财经我都做了 zz443470785 发表于 2023-8-15 19:53
还有B站、微博、百度、抖音、财经我都做了
不错,有API接口研究一下应该不难。
我只爬了今日头条。:lol xinxiu 发表于 2023-8-15 20:08
不错,有API接口研究一下应该不难。
我只爬了今日头条。
是的,我都是用手机app抓包,拿到了api接口 感谢分享,是个实用的功能 感谢分享,是个实用的功能,试试手