# [Python] plain-text view / copy code  (forum listing header; not part of the program)
import html
import json
import logging
import os
import re
import sys
import time
from functools import wraps

import docx
import requests
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from docx.shared import Inches

# Raise the maximum recursion depth.
sys.setrecursionlimit(3000)
def retry(exception=Exception, tries=3, delay=1, logger=logging):
    """Build a decorator that calls a function again when it raises.

    :param exception: exception class (or tuple of classes) that triggers a retry
    :param tries: total number of attempts, counting the first call
    :param delay: seconds to sleep after each failed, guarded attempt
    :param logger: object whose ``error`` method receives each caught
        exception; a falsy value disables logging
    :return: the decorator; the exception of the last attempt is not caught
        and propagates to the caller
    """
    def decorator(f):
        @wraps(f)
        def wrapper(*args, **kwargs):
            remaining = tries
            while True:
                if remaining <= 1:
                    # Last attempt: run unguarded so its exception propagates.
                    return f(*args, **kwargs)
                try:
                    return f(*args, **kwargs)
                except exception as err:
                    remaining -= 1
                    if logger:
                        logger.error(err)
                    time.sleep(delay)
        return wrapper
    return decorator
class ZhiHu:
    """Collect the answers of one Zhihu question, page by page.

    After :meth:`main` (or :meth:`get_first` / :meth:`get_later`) has run,
    ``self.questions`` holds the question metadata and ``self.answers`` holds
    one dict per answer with the keys ``author``, ``url``, ``content`` and
    ``imgs``.
    """

    # Seconds to wait for an HTTP response before the request is aborted.
    REQUEST_TIMEOUT = 10
    # Pause between two consecutive page requests, in seconds.
    PAGE_DELAY = 0.2

    def __init__(self, url):
        """Remember the question URL and prepare empty result containers.

        :param url: address of the question page to scrape
        """
        self.headers = {
            'Host': 'www.zhihu.com',
            'User-Agent': 'Mozilla/5.0'
        }
        self.url = url
        self.answers = []
        self.questions = {}
        # Page number at which get_later stops; 0 never equals a page number
        # here, so paging continues until the response reports is_end.
        self.num = 0

    @staticmethod
    def _parse_content(content):
        """Split an answer's HTML into plain text and image URLs.

        :param content: HTML string of one answer
        :return: ``(text, imgs)`` where ``text`` joins the contents of the
            ``<p>`` elements with newlines and ``imgs`` lists every distinct
            ``src="http..."`` URL in order of first appearance
        """
        # Collect the URLs before the <figure> blocks are stripped, otherwise
        # images that live inside a figure are lost.  dict.fromkeys removes
        # duplicates while keeping a stable order (set() did not).
        imgs = list(dict.fromkeys(re.findall(r'src="(http.+?)"', content)))
        # Non-greedy body: remove each figure on its own instead of everything
        # between the first <figure> and the last </figure>.
        without_figures = re.sub(r'<figure.*?>.+?</figure.*?>', '', content, flags=re.DOTALL)
        paragraphs = re.findall(r'<p.*?>(.*?)</p.*?>', without_figures)
        text = '\n'.join(paragraphs).replace('<br/>', '\n')
        # Decode HTML entities such as &quot; and &amp; into plain characters.
        return html.unescape(text), imgs

    def _add_answer(self, answer):
        """Append one answer record to ``self.answers``.

        :param answer: dict with an ``author`` dict (``name``, ``url``) and an
            HTML ``content`` string
        """
        author = answer['author']
        text, imgs = self._parse_content(answer['content'])
        self.answers.append({
            'author': author['name'],
            'url': author['url'].replace('/api/v4', ''),
            'content': text,
            'imgs': imgs,
        })

    def _fetch_page(self, url):
        """Request one follow-up page and return the decoded JSON body.

        :param url: address of the page to request
        :return: the parsed JSON response
        """
        headers = self.headers.copy()
        headers['Referer'] = self.url

        def request():
            return requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT).json()

        # Only the HTTP round-trip is retried, so a failed request never
        # causes answers that were already stored to be appended again.
        return retry()(request)()

    def get_first(self):
        """Load the question page and store its metadata and embedded answers.

        Fills ``self.questions`` and appends the answers embedded in the page
        to ``self.answers``.

        :return: URL of the next page of answers, or ``None`` when the page
            contains no "next" link
        :raises IndexError: when the embedded ``js-initialData`` JSON block is
            missing from the response
        """
        res = requests.get(self.url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        page = res.text

        # Link to the next page.  The captured text is the body of a JSON
        # string, so json.loads decodes its escapes.
        link = re.search(r'"next":"(http.+?)",', page)
        next_url = json.loads(f'"{link.group(1)}"') if link else None

        # Embedded initial state of the page.
        found = re.search(r'<script id="js-initialData" type="text/json">(.+?)</script>', page)
        if found is None:
            raise IndexError('js-initialData block not found in the question page')
        entities = json.loads(found.group(1))['initialState']['entities']

        # Question metadata.
        question = list(entities['questions'].values())[0]
        self.questions = {
            'title': question['title'],
            'url': question['url'].replace('/api/v4/questions/', '/question/'),
            'answerCount': question['answerCount'],
            'author': question['author']['name'],
            'authorUrl': question['author']['url'].replace('/api/v4', ''),
        }

        # Answers embedded in the first page.
        for answer in entities['answers'].values():
            self._add_answer(answer)
        return next_url

    def get_later(self, url):
        """Fetch follow-up pages starting at ``url`` and store their answers.

        Pages are followed through the ``paging.next`` link of each response
        until the response reports ``is_end`` or its page number equals
        ``self.num``.  The loop is iterative, so the number of pages is not
        limited by the recursion depth.

        :param url: address of the first follow-up page; a falsy value means
            there is nothing to fetch
        """
        while url:
            res = self._fetch_page(url)
            paging = res['paging']
            is_end = paging['is_end']
            page = paging['page']
            for item in res['data']:
                self._add_answer(item['target'])
            print(f'\r已获取到{page}页。', end='')
            if is_end or page == self.num:
                return
            time.sleep(self.PAGE_DELAY)
            url = paging['next']

    def main(self):
        """Run the interactive scrape and return the collected data.

        Loads the first page, asks on stdin how many pages to fetch, then
        fetches follow-up pages unless exactly one page was requested.

        :return: ``{'question': self.questions, 'answer': self.answers}``
        """
        next_url = self.get_first()
        print('\n已获取到第1页。')
        print(f'\n共有{self.questions["answerCount"]}条回答。')
        self.num = int(input('\n想要获取多少页(每页5条回答,输入0获取全部):'))
        # next_url is None when the first page had no "next" link.
        if self.num != 1 and next_url:
            self.get_later(next_url)
        return {'question': self.questions, 'answer': self.answers}
# Add a hyperlink to docx text
def add_hyperlink(paragraph, url, text, color, underline):
    """Append an external hyperlink to a paragraph.

    :param paragraph: the paragraph that receives the hyperlink
    :param url: string with the link target
    :param text: text displayed for the link
    :param color: value written to a ``w:color`` element of the run, or
        ``None`` to add no color element
    :param underline: when falsy, a ``w:u`` element with value ``none`` is
        added to the run properties
    :return: the created ``w:hyperlink`` element
    """
    # Register the URL as an external relationship of the paragraph's part and
    # keep the relationship id it returns.
    rel_id = paragraph.part.relate_to(
        url, docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external=True
    )

    make_element = docx.oxml.shared.OxmlElement
    qualified = docx.oxml.shared.qn

    # Run properties: optional color, optional "no underline".
    run_props = make_element('w:rPr')
    if color is not None:
        color_el = make_element('w:color')
        color_el.set(qualified('w:val'), color)
        run_props.append(color_el)
    if not underline:
        underline_el = make_element('w:u')
        underline_el.set(qualified('w:val'), 'none')
        run_props.append(underline_el)

    # The run carries the properties and the visible text.
    run = make_element('w:r')
    run.append(run_props)
    run.text = text

    # Wrap the run in a w:hyperlink that points at the relationship id, then
    # attach it to the paragraph's XML element.
    link = make_element('w:hyperlink')
    link.set(qualified('r:id'), rel_id)
    link.append(run)
    paragraph._p.append(link)
    return link
@retry()
def save_docx(data, path='zhihu.docx'):
    """Write the scraped question and answers to a Word document.

    The question title and the question author become centered headings that
    link to their pages.  Each answer gets a level-2 heading linking to its
    author, a paragraph with the answer text, and one linked paragraph per
    image URL.

    :param data: dict with a ``question`` dict (``title``, ``author``, ``url``,
        ``authorUrl``) and an ``answer`` list of dicts (``author``, ``url``,
        ``content``, ``imgs``)
    :param path: file the document is saved to; defaults to ``zhihu.docx``
    """
    document = Document()
    # Use 宋体 for both the regular and the East Asian font of the Normal style.
    normal = document.styles['Normal']
    normal.font.name = u'宋体'
    normal._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')

    # Question title and author as centered, linked headings.
    question = data['question']
    title_heading = document.add_heading()
    title_heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
    author_heading = document.add_heading(level=2)
    author_heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
    add_hyperlink(title_heading, question['url'], question['title'], 'eb1515', False)
    add_hyperlink(author_heading, question['authorUrl'], question['author'], '856e14', False)

    # One section per answer.
    for answer in data['answer']:
        heading = document.add_heading(level=2)
        add_hyperlink(heading, answer['url'], answer['author'], '1b8755', False)
        body = document.add_paragraph(answer['content'])
        # The indent belongs to paragraph_format; assigning it on the
        # paragraph object itself only created an unused attribute.
        body.paragraph_format.first_line_indent = Inches(-0.25)
        for num, img in enumerate(answer['imgs'], start=1):
            pic = document.add_paragraph()
            add_hyperlink(pic, img, f'图片{num}', '3a44cf', False)
    document.save(path)
def main():
    """Interactive entry point: scrape one question and export it to Word.

    Asks for the question URL, collects the answers, writes them to
    ``zhihu.json``, converts that file to ``zhihu.docx`` and deletes the JSON
    file after a successful conversion.  When the conversion fails, the error
    is printed and ``zhihu.json`` is left on disk.  In both cases the function
    waits for Enter before returning.
    """
    url = input('输入知乎问题链接:')
    zhihu = ZhiHu(url)
    res = zhihu.main()
    print()
    print(f"共获取到{len(res['answer'])}条回答。")
    with open('zhihu.json', 'w', encoding='utf-8') as f:
        json.dump(res, f)
    print('\n文件已保存为zhihu.json')
    print('\n正在将数据写入zhihu.docx')
    try:
        with open('zhihu.json', 'r', encoding='utf-8') as f:
            data = json.load(f)
        save_docx(data)
        print('保存成功。')
        print('\n正在删除zhihu.json文件')
        os.remove('zhihu.json')
        print('已删除zhihu.json')
    except Exception as e:
        # Top-level boundary of the script: report the problem to the user.
        print('保存失败。')
        print(e)
    # Pause on success and on failure alike, so an error message can still be
    # read before the function returns.
    input()


# Run the scraper only when the file is executed as a script.
if __name__ == '__main__':
    main()