[Python Repost] Download Zhihu Q&A answers as docx

Arcticlyc posted on 2022-9-15 10:09
Introduction
While browsing Zhihu I found it tedious that a question's answers keep loading in batches as you scroll, so I wrote a batch scraper for them.
Netdisk link: https://pan.baidu.com/s/11JkNWu1kq-A1e7XfMitu7A?pwd=2dhs (extraction code: 2dhs)


Notes
The input link must point directly at a question (https://www.zhihu.com/question/<id>), not at a single answer (.../question/<id>/answer/<id>).
Because the link to the next page of answers is contained in the previous page, the answers are fetched single-threaded and recursively; a non-recursive sketch is shown right after these notes.
When saving the data, some of the docx formatting I set did not take effect, and I don't know why.
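Side note on the recursion: since every API response already carries the next-page URL, get_later can just as well be written as a plain loop, which removes the need for sys.setrecursionlimit. Here is a minimal sketch that relies on the same response fields the code below uses (paging.is_end, paging.next, data[*].target); the function name fetch_all_answers is made up for illustration and is not part of the posted script.

import time
import requests


def fetch_all_answers(first_next_url, headers, max_pages=0):
    """Follow the 'next' links iteratively instead of recursively.

    first_next_url: next-page URL taken from the question page.
    max_pages: 0 means keep going until the API reports is_end.
    Illustrative sketch only, not the posted implementation.
    """
    answers = []
    url = first_next_url
    page = 0
    while url:
        res = requests.get(url, headers=headers).json()
        page += 1
        for item in res['data']:
            answers.append(item['target'])
        paging = res['paging']
        if paging['is_end'] or (max_pages and page >= max_pages):
            break
        url = paging['next']
        time.sleep(0.2)  # small pause between requests
    return answers

With a loop there is no recursion depth to worry about, and the @retry decorator can wrap a single-page fetch instead of the whole chain of pages.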


Code
import os
import time
import requests
import re
import json
import logging
from functools import wraps
import docx
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from docx.shared import Inches

# Raise the recursion limit (get_later calls itself once per page of answers)
import sys
sys.setrecursionlimit(3000)


def retry(exception=Exception, tries=3, delay=1, logger=logging):
    '''
    Retry decorator.
    :param exception: exception type to catch
    :param tries: total number of attempts
    :param delay: seconds to sleep between attempts
    :param logger: logger used to record the caught exception
    :return:
    '''
    def decorator(f):
        @wraps(f)
        def wrapper(*args, **kwargs):
            _tries = tries
            while _tries > 1:
                try:
                    return f(*args, **kwargs)
                except exception as e:
                    _tries -= 1
                    if logger:
                        logger.error(e)
                    time.sleep(delay)
            return f(*args, **kwargs)
        return wrapper
    return decorator


class ZhiHu:
    def __init__(self, url):
        self.headers = {
            'Host': 'www.zhihu.com',
            'User-Agent': 'Mozilla/5.0'
        }
        self.url = url
        self.answers = []

    
    def get_first(self):
        res = requests.get(self.url, headers=self.headers)

        # Extract the next-page URL embedded in the page (it is stored \u-escaped)
        next = re.findall(r'"next":"(http.+?)",', res.text)[0].encode('utf-8').decode("unicode_escape")

        # Parse the embedded initial-state JSON
        data = re.findall(r'<script id="js-initialData" type="text/json">(.+?)</script>', res.text)[0]
        data = json.loads(data)['initialState']['entities']

        # Question metadata
        question = list(data['questions'].values())[0]
        title = question['title']
        question_url = question['url'].replace('/api/v4/questions/', '/question/')
        answer_count = question['answerCount']
        author = question['author']['name']
        author_url = question['author']['url'].replace('/api/v4', '')

        self.questions = {
            'title': title,
            'url': question_url,
            'answerCount': answer_count,
            'author': author,
            'authorUrl': author_url,
        }

        # Answers on the first page
        answer = data['answers']
        for ans in answer:
            name = answer[ans]['author']['name']
            name_url = answer[ans]['author']['url'].replace('/api/v4', '')
            content = answer[ans]['content']

            # Collect image URLs before stripping the <figure> blocks that contain them
            imgs = re.findall(r'src="(http.+?)"', content)
            content = re.sub(r'<figure.*?>.+?</figure>', '', content)

            # Keep only paragraph text, turning <br/> into newlines
            text = re.findall(r'<p.*?>(.*?)</p.*?>', content)
            text = '\n'.join(text).replace('<br/>', '\n')
            text = text.replace('&quot;', '"')
            self.answers.append({'author': name, 'url': name_url, 'content': text, 'imgs': list(set(imgs))})

        return next


    @retry()
    def get_later(self, url):
        headers = self.headers.copy()
        headers['Referer'] = self.url
        res = requests.get(url, headers=headers).json()
        
        # Paging info
        is_end = res['paging']['is_end']
        next = res['paging']['next']
        page = res['paging']['page']
        
        # Parse the answer entries
        datas = res['data']
        for data in datas:
            name = data['target']['author']['name']
            name_url = data['target']['author']['url'].replace('/api/v4', '')
            content = data['target']['content']

            # Collect image URLs before stripping the <figure> blocks that contain them
            imgs = re.findall(r'src="(http.+?)"', content)
            content = re.sub(r'<figure.*?>.+?</figure>', '', content)

            text = re.findall(r'<p.*?>(.*?)</p.*?>', content)
            text = '\n'.join(text).replace('<br/>', '\n')
            self.answers.append({'author': name, 'url': name_url, 'content': text, 'imgs': list(set(imgs))})
        print(f'\r已获取到{page}页。', end='')
        if is_end or page == self.num:
            return
        
        time.sleep(0.2)
        self.get_later(next)


    def main(self):
        next = self.get_first()
        print('\n已获取到第1页。')
        print(f'\n共有{self.questions["answerCount"]}条回答。')
        self.num = int(input('\n想要获取多少页(每页5条回答,输入0获取全部):'))
        if self.num != 1:
            self.get_later(next)
        
        return {'question': self.questions, 'answer': self.answers}


# Add hyperlinked text to a docx paragraph
def add_hyperlink(paragraph, url, text, color, underline):
    """
    A function that places a hyperlink within a paragraph object.

    :param paragraph: The paragraph we are adding the hyperlink to.
    :param url: A string containing the required url
    :param text: The text displayed for the url
    :param color: Hex RGB string for the link color, or None to keep the default
    :param underline: Set False to suppress the underline on the link text
    :return: The hyperlink object
    """

    # This gets access to the document.xml.rels file and gets a new relation id value
    part = paragraph.part
    r_id = part.relate_to(url, docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external=True)

    # Create the w:hyperlink tag and add needed values
    hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink')
    hyperlink.set(docx.oxml.shared.qn('r:id'), r_id, )

    # Create a w:r element
    new_run = docx.oxml.shared.OxmlElement('w:r')

    # Create a new w:rPr element
    rPr = docx.oxml.shared.OxmlElement('w:rPr')

    # Add color if it is given
    if color is not None:
        c = docx.oxml.shared.OxmlElement('w:color')
        c.set(docx.oxml.shared.qn('w:val'), color)
        rPr.append(c)

    # Turn off underlining when it is not requested
    if not underline:
        u = docx.oxml.shared.OxmlElement('w:u')
        u.set(docx.oxml.shared.qn('w:val'), 'none')
        rPr.append(u)

    # Join all the xml elements together and add the required text to the w:r element
    new_run.append(rPr)
    new_run.text = text
    hyperlink.append(new_run)

    paragraph._p.append(hyperlink)

    return hyperlink


@retry()
def save_docx(data):
    document = Document()
    document.styles['Normal'].font.name = u'宋体'
    document.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')

    # Question metadata used for the document headings
    questions = data['question']
    title = questions['title']
    author = questions['author']
    answerCount = questions['answerCount']
    url = questions['url']
    authorUrl = questions['authorUrl']

    title_1 = document.add_heading()
    title_1.alignment = WD_ALIGN_PARAGRAPH.CENTER

    title_2 = document.add_heading(level=2)
    title_2.alignment = WD_ALIGN_PARAGRAPH.CENTER

    hyperlink_1 = add_hyperlink(title_1, url, title, 'eb1515', False)
    hyperlink_2 = add_hyperlink(title_2, authorUrl, author, '856e14', False)

    # Write each answer
    answers = data['answer']
    for answer in answers:
        name = answer['author']
        name_url = answer['url']
        content = answer['content']
        imgs = answer['imgs']

        title_ = document.add_heading(level=2)
        hyperlink = add_hyperlink(title_, name_url, name, '1b8755', False)

        p = document.add_paragraph(content)
        # Indentation has to be set via paragraph_format; assigning p.first_line_indent directly has no effect
        p.paragraph_format.first_line_indent = Inches(-0.25)
        p_ = p.add_run()  # this run is empty, so the bold flag below is not visible
        p_.bold = True

        for num, img in enumerate(imgs):
            pic = document.add_paragraph()
            add_hyperlink(pic, img, f'图片{num + 1}', '3a44cf', False)
    document.save('zhihu.docx')


def main():
    url = input('输入知乎问题链接:')
    zhihu = ZhiHu(url)

    res = zhihu.main()
    print()

    # import pprint
    # pprint.pp(res['question'])
    # pprint.pp(res['answer'][:3])
    # print('\n以上是前三条回答。')
    print(f"共获取到{len(res['answer'])}条回答。")

    with open('zhihu.json', 'w') as f:
        json.dump(res, f)
    print('\n文件已保存为zhihu.json')

    print('\n正在将数据写入zhihu.docx')
    try:
        with open('zhihu.json', 'r') as f:
            data = json.load(f)
        save_docx(data)
        print('保存成功。')
        print('\n正在删除zhihu.json文件')
        os.remove('zhihu.json')
        print('已删除zhihu.json')

    except Exception as e:
        print('保存失败。')
        print(e)
        return
    
    input()  # keep the console window open until Enter is pressed
    



if __name__ == '__main__':
    main()


[OP] Arcticlyc posted on 2023-4-2 00:51
Quoting jsonchen (2023-4-1 23:25): Thanks for this handy tool, OP, it saved me a lot of time reinventing the wheel.

Haha, glad it helped. To be honest it turned out half-baked: the formatting isn't right, the images aren't inserted, and a lot of the HTML isn't recognized. I was too lazy to keep working on it, so I left it as is.
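For anyone who wants the images actually embedded rather than just linked, python-docx can insert a picture from an in-memory stream, so the imgs list the scraper already collects could be fed through something like the following. This is a rough sketch only; embed_images is a made-up helper name and the fixed width is an arbitrary choice.

from io import BytesIO

import requests
from docx.shared import Inches


def embed_images(document, imgs):
    """Download each image URL and embed it into the python-docx Document.

    imgs is the list of image URLs collected for one answer.
    Illustrative sketch, not part of the posted script.
    """
    for img in imgs:
        try:
            resp = requests.get(img, timeout=10)
            resp.raise_for_status()
            document.add_picture(BytesIO(resp.content), width=Inches(5))
        except Exception:
            # Fall back to a plain paragraph with the URL if the download or format fails
            document.add_paragraph(f'[image not downloaded: {img}]')

In save_docx this could replace the loop that currently adds a numbered hyperlink for each image.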
jsonchen posted on 2023-4-1 23:20
OK, thanks OP. I'm on Python 3.8 and the problem is solved now. I save the crawled results into a MongoDB database first and then do the processing from there.
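For reference, the dict returned by ZhiHu.main() maps onto MongoDB collections fairly directly. A rough pymongo sketch, assuming that workflow; the zhihu database and the questions/answers collection names are made up for illustration:

from pymongo import MongoClient


def save_to_mongo(res, uri='mongodb://localhost:27017'):
    """Store the question document and its answers in MongoDB.

    res is the {'question': ..., 'answer': [...]} dict returned by
    ZhiHu.main(); database and collection names are illustrative only.
    """
    client = MongoClient(uri)
    db = client['zhihu']
    db['questions'].insert_one(res['question'])
    if res['answer']:
        db['answers'].insert_many(res['answer'])
    client.close()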
hkhkhk posted on 2022-11-13 13:20
tan270 posted on 2022-11-13 15:12
Thanks for sharing, I've been wanting something like this for a long time.
baba333 posted on 2022-11-13 16:37
Thanks for sharing, will study it.
qup posted on 2022-11-13 21:48
Instantly bookmarked.
allophus posted on 2022-12-1 08:28
Thanks for sharing, saved.
嬉皮笑脸 posted on 2022-12-14 23:22
Marking this.
jsonchen posted on 2023-4-1 22:35
Thanks for sharing, saved.
jsonchen posted on 2023-4-1 22:41
OP, could you share which Python version and which version of the docx module you used? Many thanks.
[OP] Arcticlyc posted on 2023-4-1 23:18
Quoting jsonchen (2023-4-1 22:41): OP, could you share which Python version and which version of the docx module you used? Many thanks.

I don't remember the docx version; Python was 3.10.