Python获取某笔错题

jackson123456 · 发表于 2024-11-15 08:31

本帖最后由 jackson123456 于 2024-11-15 08:33 编辑

考公的时候做了不少错题，使用某笔app想要导出的时候提示我只能导出10题，我错的几千道题目不得导几十天。开会员又太贵了，全职备考没钱。于是通过对web端的分析，对错题内容进行爬取。
1.登录部分抓取cookies（不会分析如何获取cookies，所以每次我都是半自动爬虫）
2.点击个人中心发现错题前面的url都一样只不过是后面请求的id不一样（一次10个都是错题的id）
3.请求后对其进行获取和正则处理
4.写在txt里面方便我进行修改和打印
不足之处：
1.对json节点的取值没有完善，需要到代码里面修改
2.有些题目是图片题只获取了url没有下载
3.请求的cookies网页一定是错题界面那个url不然获取不到的

大佬们，遇到有加密的网页参数应该如何逆向？每次遇到这些我都不会，断点也不会打。
希望各位大佬不吝赐教学习学习

[Python] 纯文本查看 复制代码

import requests
from bs4 import BeautifulSoup

def fetch_data(cookie_value, timeout=10):
    """Fetch the wrong-question keypoint tree as parsed JSON.

    Sends a GET request to the ``errors/keypoint-tree`` endpoint with
    browser-like headers and the caller's cookie string.

    Args:
        cookie_value: Raw ``Cookie`` header string copied from a logged-in
            browser session.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The decoded JSON payload on HTTP 200. ``None`` when the request
        fails, the status code is not 200, or the body is not valid JSON.
    """
    url = "https://tiku.fenbi.com/api/xingce/errors/keypoint-tree"
    headers = {
        "Host": "tiku.fenbi.com",
        "Connection": "keep-alive",
        "sec-ch-ua-platform": "\"Windows\"",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "sec-ch-ua": "\"Chromium\";v=\"130\", \"Google Chrome\";v=\"130\", \"Not?A_Brand\";v=\"99\"",
        "sec-ch-ua-mobile": "?0",
        "Origin": "https://www.fenbi.com",
        "Sec-Fetch-Site": "same-site",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Referer": "https://www.fenbi.com/",
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cookie": cookie_value,
    }
    params = {
        "timeRange": "0",
        "order": "desc",
        "app": "web",
        "kav": "100",
        "av": "100",
        "hav": "100",
        "version": "3.0.0.0",
    }

    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as a bad
        # status code: log and return None so the caller can bail out.
        print(f"Request failed: {exc}")
        return None

    if response.status_code != 200:
        print(f"Request failed with status code: {response.status_code}")
        return None

    print("Request was successful.")
    try:
        return response.json()
    except ValueError as exc:
        # A 200 response whose body is not JSON (e.g. a login page).
        print(f"Response was not valid JSON: {exc}")
        return None
def extract_question_ids(data, root_index=0, child_index=2):
    """Pull the ``questionIds`` list out of one node of the keypoint tree.

    Reads ``data[root_index]['children'][child_index]['questionIds']``.
    The defaults (first root, third child) reproduce the previously
    hard-coded path; pass other indices to target a different node without
    editing the code.

    Args:
        data: Decoded JSON returned by the keypoint-tree request.
        root_index: Index of the top-level element to descend into.
        child_index: Index inside that element's ``children`` list.

    Returns:
        The ``questionIds`` value of the selected node, or ``None`` when the
        path does not exist in ``data`` (an error message is printed).
    """
    try:
        return data[root_index]['children'][child_index]['questionIds']
    except (IndexError, KeyError, TypeError) as e:
        print(f"Error extracting question IDs: {e}")
        return None
def _parse_solution(solution):
    """Convert one raw solution dict into plain-text content/options/solution."""
    # Treat an explicit null the same as a missing field so that neither
    # str.replace nor the HTML parser is ever handed None.
    content = solution.get("content", "N/A")
    if content is None:
        content = "N/A"
    # Fill-in-the-blank placeholders arrive as an underlined run of
    # non-breaking spaces; turn them into visible underscores.
    content = content.replace('<u>\xa0 \xa0 \xa0 \xa0 \xa0 \xa0\xa0</u>', '_______')
    content_text = BeautifulSoup(content, "html.parser").get_text()

    formatted_options = []
    accessories = solution.get("accessories") or []
    if accessories:
        options = accessories[0].get("options") or []
        # Only the first four options are kept, labelled A-D.
        formatted_options = [
            f"{chr(65 + idx)}. {option}"
            for idx, option in enumerate(options[:4])
        ]

    solution_text = solution.get("solution", "N/A")
    if solution_text is None:
        solution_text = "N/A"
    solution_text_clean = BeautifulSoup(solution_text, "html.parser").get_text()

    return {
        "content": content_text,
        "options": formatted_options if formatted_options else ["N/A"],
        "solution": solution_text_clean,
    }


def download_question(questions_id, cookie=None, batch_size=10, timeout=10):
    """Download question text, options and explanations for the given IDs.

    The IDs are requested from the ``solutions`` endpoint in batches and
    every returned solution is stripped of HTML.

    Args:
        questions_id: Sequence of question IDs. ``None`` or an empty
            sequence yields an empty result instead of raising.
        cookie: Raw ``Cookie`` header string. When omitted, the module-level
            ``cookie_value`` is used, which keeps existing one-argument
            calls working.
        batch_size: Number of IDs sent per request (defaults to 10, the
            batch size this function has always used).
        timeout: Seconds to wait for each request before giving up.

    Returns:
        A list of dicts with keys ``content``, ``options`` and ``solution``.
        Batches that fail (network error, non-200 status, invalid JSON) are
        reported on stdout and skipped.
    """
    if not questions_id:
        return []

    if cookie is None:
        cookie = cookie_value

    url = "https://tiku.fenbi.com/api/xingce/universal/auth/solutions"

    headers = {
        "Host": "tiku.fenbi.com",
        "Connection": "keep-alive",
        "sec-ch-ua-platform": "\"Windows\"",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "sec-ch-ua": "\"Chromium\";v=\"130\", \"Google Chrome\";v=\"130\", \"Not?A_Brand\";v=\"99\"",
        "sec-ch-ua-mobile": "?0",
        "Origin": "https://www.fenbi.com",
        "Sec-Fetch-Site": "same-site",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Referer": "https://www.fenbi.com/",
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "Accept-Language": "zh-CN,zh;q=0.9",
        # Send the raw cookie string as the Cookie header, exactly like
        # fetch_data does. Passing it via cookies={"Cookie": ...} would
        # instead create a single cookie *named* "Cookie".
        "Cookie": cookie,
    }

    all_results = []

    for i in range(0, len(questions_id), batch_size):
        batch_question_ids = questions_id[i:i + batch_size]

        params = {
            "type": "1",
            "questionIds": ",".join(map(str, batch_question_ids)),
            "app": "web",
            "kav": "100",
            "av": "100",
            "hav": "100",
            "version": "3.0.0.0",
        }

        try:
            response = requests.get(url, headers=headers, params=params, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Error: {exc} for batch starting at index {i}")
            continue

        if response.status_code != 200:
            print(f"Error: {response.status_code} for batch starting at index {i}")
            continue

        try:
            solutions = response.json().get("solutions", []) or []
        except ValueError as exc:
            print(f"Error: invalid JSON ({exc}) for batch starting at index {i}")
            continue

        all_results.extend(_parse_solution(solution) for solution in solutions)

    return all_results
def format_data(all_results):
    """Print every downloaded question to stdout in a readable layout.

    Each entry is shown as its quoted question text, an options header
    followed by one option per line, and the quoted explanation followed by
    a blank line. Nothing is returned.
    """
    for record in all_results:
        print(f"题目: '{record['content']}'")
        # Header and options share one call; the separator puts each on its own line.
        print("选项:", *record['options'], sep="\n")
        print(f"解析: '{record['solution']}'\n")
def save_to_txt(all_results, filename="./questions_output.txt"):
    """Write all downloaded questions to a UTF-8 text file.

    The file is created or overwritten. Each entry is written as the
    question line, an options header, one line per option, and the
    explanation followed by a blank line. A confirmation message naming the
    file is printed afterwards.
    """
    with open(filename, "w", encoding="utf-8") as out:
        for record in all_results:
            # Assemble the lines of one entry, then emit them together.
            chunks = [f"题目: {record['content']}\n", "选项:\n"]
            chunks.extend(f"{choice}\n" for choice in record['options'])
            chunks.append(f"解析: {record['solution']}\n\n")
            out.writelines(chunks)

    print(f"数据已成功保存到 {filename}")

# Script entry: ask for the cookie string, fetch the wrong-question tree,
# download every referenced question, then save and display the result.
#
# Paste the Cookie header copied from a logged-in browser session when
# prompted. Never hard-code a real session cookie in the source: anyone who
# reads the file could reuse the session.
cookie_value = input('请输入粉笔的cookies：')
data = fetch_data(cookie_value)
print('')
if data:
    questions_id = extract_question_ids(data)  # IDs of the wrong questions
    if questions_id:
        problems = download_question(questions_id)
        save_to_txt(problems)
        # format_data prints directly and returns None, so wrapping it in
        # print() would only add a stray "None" line.
        format_data(problems)
    else:
        # extract_question_ids returns None when the expected node is missing.
        print("No question IDs found; nothing to download.")