知乎问答下载为docx
介绍:浏览知乎时,查看一个问题的回答需要不断刷新加载,很麻烦,于是写了一个批量爬取回答的脚本。
网盘链接:链接: https://pan.baidu.com/s/11JkNWu1kq-A1e7XfMitu7A?pwd=2dhs 提取码: 2dhs 复制这段内容后打开百度网盘手机App,操作更方便哦
注意
输入的链接形式必须是直达问题,而不是某一回答。
同时由于获取下一页回答的链接在上一页中,因此使用单线程递归获取。
在保存数据的时候,设置docx中的一些格式未能生效,原因不知。
代码块
import html
import json
import logging
import os
import re
import time
from functools import wraps

import docx
import requests
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from docx.shared import Inches
# 更改最大递归次数
import sys
sys.setrecursionlimit(3000)
def retry(exception=Exception, tries=3, delay=1, logger=logging):
    """Build a decorator that re-runs a function when it raises.

    :param exception: Exception class (or tuple of classes) that triggers a retry.
    :param tries: Total number of attempts, including the first call.
    :param delay: Seconds to sleep after a failed attempt before the next one.
    :param logger: Object whose ``error`` method receives each caught exception;
        pass a falsy value to disable logging.
    :return: A decorator. The wrapper it produces returns the wrapped function's
        result; the exception raised by the final attempt is not caught.
    """
    def decorator(f):
        @wraps(f)
        def wrapper(*args, **kwargs):
            # The first ``tries - 1`` attempts are guarded; failures are logged
            # and followed by a pause.
            for _ in range(tries - 1):
                try:
                    return f(*args, **kwargs)
                except exception as e:
                    if logger:
                        logger.error(e)
                    time.sleep(delay)
            # Last attempt runs unguarded so the caller sees the real error.
            return f(*args, **kwargs)
        return wrapper
    return decorator
class ZhiHu:
    """Collect the answers of one Zhihu question, page by page.

    The first page is read from the JSON embedded in the question's HTML page;
    later pages are read from the paging URL each response provides.
    Results accumulate in ``self.questions`` and ``self.answers``.
    """

    # Seconds to wait for each HTTP response, so a stalled connection cannot
    # hang the crawl.
    TIMEOUT = 10

    def __init__(self, url):
        """Store the question URL and prepare empty result containers.

        :param url: URL of the question page (not of a single answer).
        """
        self.headers = {
            'Host': 'www.zhihu.com',
            'User-Agent': 'Mozilla/5.0'
        }
        self.url = url
        self.answers = []
        self.questions = {}
        # Page number at which get_later stops; 0 means "no limit".
        self.num = 0

    @staticmethod
    def _parse_answer(answer):
        """Turn one answer dict into the flat record stored in ``self.answers``.

        :param answer: Dict with an ``author`` dict (``name``, ``url``) and an
            HTML ``content`` string.
        :return: Dict with keys ``author``, ``url``, ``content`` and ``imgs``.
        """
        author = answer['author']
        raw = answer['content']
        # Collect image URLs before the figures are stripped, keeping the
        # first occurrence of each URL in document order.
        imgs = list(dict.fromkeys(re.findall(r'src="(http.+?)"', raw)))
        # Non-greedy so that text between two figures is kept.
        body = re.sub(r'<figure.*?>.+?</figure.*?>', '', raw, flags=re.S)
        paragraphs = re.findall(r'<p.*?>(.*?)</p.*?>', body)
        text = '\n'.join(paragraphs).replace('<br/>', '\n')
        return {
            'author': author['name'],
            'url': author['url'].replace('/api/v4', ''),
            # Decode HTML entities such as &quot; into plain characters.
            'content': html.unescape(text),
            'imgs': imgs,
        }

    @staticmethod
    def _extract_next_url(page):
        """Return the first ``"next"`` paging URL found in *page*, or None.

        The URL is stored with ``\\uXXXX`` escapes in the page source, so it is
        decoded with ``unicode_escape`` before being returned.
        """
        match = re.search(r'"next":"(http.+?)",', page)
        if match is None:
            return None
        return match.group(1).encode('utf-8').decode('unicode_escape')

    @staticmethod
    def _extract_entities(page):
        """Return ``initialState.entities`` from the page's embedded JSON.

        :raises ValueError: If the ``js-initialData`` script tag is missing.
        """
        match = re.search(
            r'<script id="js-initialData" type="text/json">(.+?)</script>', page)
        if match is None:
            raise ValueError('未在页面中找到问题数据,请确认链接直达问题页面。')
        return json.loads(match.group(1))['initialState']['entities']

    @staticmethod
    def _parse_page(res):
        """Split one decoded paging response into its parts.

        :param res: Dict with ``paging`` (``is_end``, ``next``, ``page``) and
            ``data`` (a list of items whose ``target`` is an answer dict).
        :return: Tuple ``(answers, is_end, next_url, page)``.
        """
        paging = res['paging']
        answers = [ZhiHu._parse_answer(item['target']) for item in res['data']]
        return answers, paging['is_end'], paging['next'], paging['page']

    def get_first(self):
        """Fetch the question page and record the question and its first answers.

        Fills ``self.questions`` and appends the embedded answers to
        ``self.answers``.

        :return: URL of the next page of answers, or None if the page has none.
        :raises ValueError: If the page carries no question data.
        """
        res = requests.get(self.url, headers=self.headers, timeout=self.TIMEOUT)
        next_url = self._extract_next_url(res.text)
        entities = self._extract_entities(res.text)

        # Question information: the entities map holds the question keyed by id.
        questions = list(entities['questions'].values())
        if not questions:
            raise ValueError('未在页面中找到问题数据,请确认链接直达问题页面。')
        question = questions[0]
        self.questions = {
            'title': question['title'],
            'url': question['url'].replace('/api/v4/questions/', '/question/'),
            'answerCount': question['answerCount'],
            'author': question['author']['name'],
            'authorUrl': question['author']['url'].replace('/api/v4', ''),
        }

        # First page of answers.
        answers = entities['answers']
        if isinstance(answers, dict):
            answers = answers.values()
        for answer in answers:
            self.answers.append(self._parse_answer(answer))
        return next_url

    def _fetch_page(self, url):
        """Download one paging URL and return ``_parse_page`` of its JSON body."""
        headers = self.headers.copy()
        headers['Referer'] = self.url
        res = requests.get(url, headers=headers, timeout=self.TIMEOUT).json()
        return self._parse_page(res)

    def get_later(self, url):
        """Follow the paging chain from *url*, appending every page's answers.

        Stops when a response reports ``is_end`` or when its page number
        equals ``self.num``.

        :param url: Paging URL of the first page to fetch.
        """
        # Retry is applied per page, and answers are appended only after a
        # page has been parsed completely, so a retry never re-adds answers.
        fetch = retry()(self._fetch_page)
        while True:
            answers, is_end, url, page = fetch(url)
            self.answers.extend(answers)
            print(f'\r已获取到{page}页。', end='')
            if is_end or page == self.num:
                return
            time.sleep(0.2)

    def main(self):
        """Run the interactive crawl and return the collected data.

        :return: ``{'question': self.questions, 'answer': self.answers}``.
        """
        next_url = self.get_first()
        print('\n已获取到第1页。')
        print(f'\n共有{self.questions["answerCount"]}条回答。')
        self.num = int(input('\n想要获取多少页(每页5条回答,输入0获取全部):'))
        # Page 1 is already loaded; continue only if more was requested and a
        # next-page link exists.
        if self.num != 1 and next_url:
            self.get_later(next_url)
        return {'question': self.questions, 'answer': self.answers}
# Add a hyperlink to a docx paragraph
def add_hyperlink(paragraph, url, text, color, underline):
    """
    Append an external hyperlink to a paragraph object.

    :param paragraph: The paragraph we are adding the hyperlink to.
    :param url: A string containing the required url
    :param text: The text displayed for the url
    :param color: Hex colour string for the link text (e.g. 'eb1515'),
        or None to leave the colour unset
    :param underline: Truthy to underline the link text, falsy to
        explicitly disable underlining
    :return: The hyperlink object
    """
    # Register the url as an external relationship of the document part and
    # get the new relationship id.
    part = paragraph.part
    r_id = part.relate_to(url, docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external=True)

    # Create the w:hyperlink tag pointing at that relationship.
    hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink')
    hyperlink.set(docx.oxml.shared.qn('r:id'), r_id)

    # Create the run and its properties element.
    new_run = docx.oxml.shared.OxmlElement('w:r')
    rPr = docx.oxml.shared.OxmlElement('w:rPr')

    # Add color if it is given.
    if color is not None:
        c = docx.oxml.shared.OxmlElement('w:color')
        c.set(docx.oxml.shared.qn('w:val'), color)
        rPr.append(c)

    # Always state the underline explicitly: the run carries no hyperlink
    # character style, so nothing else would underline it when requested.
    u = docx.oxml.shared.OxmlElement('w:u')
    u.set(docx.oxml.shared.qn('w:val'), 'single' if underline else 'none')
    rPr.append(u)

    # Join all the xml elements together and add the required text to the run.
    new_run.append(rPr)
    new_run.text = text
    hyperlink.append(new_run)
    paragraph._p.append(hyperlink)
    return hyperlink
@retry()
def save_docx(data):
    """Write the crawled question and answers to ``zhihu.docx``.

    The document starts with two centred headings: the question title linked
    to the question, and the asker's name linked to their profile. Each answer
    then gets a level-2 heading linking to its author, the answer text, and
    one linked paragraph per image URL.

    :param data: Dict with ``question`` (keys ``title``, ``author``, ``url``,
        ``authorUrl``) and ``answer`` (a list of dicts with keys ``author``,
        ``url``, ``content``, ``imgs``).
    """
    document = Document()
    # Set both the Latin and the East Asian font of the Normal style.
    document.styles['Normal'].font.name = u'宋体'
    document.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')

    # Question information: title and asker as centred, linked headings.
    questions = data['question']
    title_1 = document.add_heading()
    title_1.alignment = WD_ALIGN_PARAGRAPH.CENTER
    title_2 = document.add_heading(level=2)
    title_2.alignment = WD_ALIGN_PARAGRAPH.CENTER
    add_hyperlink(title_1, questions['url'], questions['title'], 'eb1515', False)
    add_hyperlink(title_2, questions['authorUrl'], questions['author'], '856e14', False)

    # Write the answers.
    for answer in data['answer']:
        heading = document.add_heading(level=2)
        add_hyperlink(heading, answer['url'], answer['author'], '1b8755', False)
        p = document.add_paragraph(answer['content'])
        # Indentation is a paragraph_format property; assigning it on the
        # Paragraph object itself leaves the document unchanged.
        p.paragraph_format.first_line_indent = Inches(-0.25)
        for num, img in enumerate(answer['imgs']):
            pic = document.add_paragraph()
            add_hyperlink(pic, img, f'图片{num + 1}', '3a44cf', False)

    document.save('zhihu.docx')
def main():
    """Interactive entry point: crawl one question and export it to docx.

    Prompts for the question URL, crawls it with ``ZhiHu``, dumps the result
    to ``zhihu.json``, builds ``zhihu.docx`` from that file and, on success,
    deletes the JSON file. If building the docx fails, the JSON file is kept
    and the function returns after printing the error.
    """
    url = input('输入知乎问题链接:')
    zhihu = ZhiHu(url)
    res = zhihu.main()
    print()
    print(f"共获取到{len(res['answer'])}条回答。")

    # Explicit UTF-8 with unescaped text keeps the JSON readable on every
    # platform when it is left behind after a failed export.
    with open('zhihu.json', 'w', encoding='utf-8') as f:
        json.dump(res, f, ensure_ascii=False)
    print('\n文件已保存为zhihu.json')

    print('\n正在将数据写入zhihu.docx')
    try:
        with open('zhihu.json', 'r', encoding='utf-8') as f:
            data = json.load(f)
        save_docx(data)
    except Exception as e:
        print('保存失败。')
        print(e)
        return
    print('保存成功。')

    # Cleanup is handled separately so that a failure to delete the JSON file
    # is not reported as a failed export.
    print('\n正在删除zhihu.json文件')
    try:
        os.remove('zhihu.json')
    except OSError as e:
        print(e)
    else:
        print('已删除zhihu.json')
    # Keep the console window open until the user presses Enter.
    input()
if __name__ == '__main__':
    main()

jsonchen 发表于 2023-4-1 23:25
谢谢楼主的神器,省了我很多时间重复造轮子
哈哈,能帮助你就好。其实这个做出来是个失败品,格式没弄好,图片没有插入,很多html代码也没有识别出来,我懒得写就没管了。 好的,谢谢楼主{:1_893:},我用的是python 3.8,问题已经解决了。我把爬下来的结果先保存到MongoDB数据库中,然后再做处理 谢谢分享学习了 感谢分享,想要这个很久了 感谢分享,学习学习 反手就是一个收藏{:1_927:} 多谢分享,收了{:1_893:} mark一下
多谢分享,收了 版主,请问一下你使用的python版本以及docx模块的版本号,万谢 jsonchen 发表于 2023-4-1 22:41
版主,请问一下你使用的python版本以及docx模块的版本号,万谢
docx我不知道了,python用的3.10
页:
[1]
2