Python获取某笔错题
本帖最后由 jackson123456 于 2024-11-15 08:33 编辑。考公的时候做了不少错题,使用某笔 app 想要导出的时候提示我只能导出 10 题,我这几千道错题不得花几十天。开会员又太贵了,全职备考没钱。于是通过对 web 端的分析,对错题内容进行爬取。
1.登录部分抓取 cookies(不会分析如何获取 cookies,所以每次我都是半自动爬虫)
2.点击个人中心发现错题前面的url都一样只不过是后面请求的id不一样 (一次10个都是错题的id)
3.请求后对其进行获取和正则处理
4.写在txt里面方便我进行修改和打印
不足之处:
1.对json的节点取的时候没有完善要到代码里面修改
2.有些题目是图片题 只获取了url没有下载
3.请求的cookies网页一定是 错题界面那个url不然获取不到的
大佬们遇到有加密的网页参数应该如何逆向每次遇到这些我都不会 断点也不会打
希望各位大佬不吝赐教,学习学习。

import requests
from bs4 import BeautifulSoup
def fetch_data(cookie_value):
    """Request the wrong-question "keypoint tree" JSON from the fenbi API.

    cookie_value: raw Cookie header string captured from a logged-in
    browser session (must come from the wrong-question page, per the
    author's note above).
    Returns the parsed JSON payload on HTTP 200, otherwise None.
    """
    endpoint = "https://tiku.fenbi.com/api/xingce/errors/keypoint-tree"
    # Browser-identical headers so the request looks like the web client.
    request_headers = {
        "Host": "tiku.fenbi.com",
        "Connection": "keep-alive",
        "sec-ch-ua-platform": "\"Windows\"",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "sec-ch-ua": "\"Chromium\";v=\"130\", \"Google Chrome\";v=\"130\", \"Not?A_Brand\";v=\"99\"",
        "sec-ch-ua-mobile": "?0",
        "Origin": "https://www.fenbi.com",
        "Sec-Fetch-Site": "same-site",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Referer": "https://www.fenbi.com/",
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cookie": cookie_value,
    }
    query = {
        "timeRange": "0",
        "order": "desc",
        "app": "web",
        "kav": "100",
        "av": "100",
        "hav": "100",
        "version": "3.0.0.0",
    }
    resp = requests.get(endpoint, headers=request_headers, params=query)
    # Guard clause: bail out early on any non-200 status.
    if resp.status_code != 200:
        print(f"Request failed with status code: {resp.status_code}")
        return None
    print("Request was successful.")
    return resp.json()
def extract_question_ids(data):
    """Pull the list of wrong-question IDs out of the keypoint-tree JSON.

    data: parsed JSON returned by fetch_data().
    Returns the 'questionIds' list, or None on any lookup failure.
    """
    try:
        # Child keypoint nodes of the tree root.
        children = data['children']
        # BUG FIX: the forum paste dropped the subscript here — the original
        # comment says "the third child node", so restore [2].
        # NOTE(review): which child holds the wanted IDs is account-specific;
        # confirm the index against your own JSON (author admits this needs
        # per-run tweaking in the post above).
        third_child = children[2]
        return third_child['questionIds']
    except (IndexError, KeyError, TypeError) as e:
        # Any shape mismatch is reported, not raised — caller checks for None.
        print(f"Error extracting question IDs: {e}")
        return None
def download_question(questions_id):
    """Download and clean the solutions for the given question IDs.

    questions_id: list of question IDs (ints) to fetch, 10 per request.
    Relies on the module-level ``cookie_value`` for authentication.
    Returns a list of dicts with keys 'content', 'options', 'solution'.
    """
    url = "https://tiku.fenbi.com/api/xingce/universal/auth/solutions"
    headers = {
        "Host": "tiku.fenbi.com",
        "Connection": "keep-alive",
        "sec-ch-ua-platform": "\"Windows\"",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "sec-ch-ua": "\"Chromium\";v=\"130\", \"Google Chrome\";v=\"130\", \"Not?A_Brand\";v=\"99\"",
        "sec-ch-ua-mobile": "?0",
        "Origin": "https://www.fenbi.com",
        "Sec-Fetch-Site": "same-site",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Referer": "https://www.fenbi.com/",
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "Accept-Language": "zh-CN,zh;q=0.9",
        # BUG FIX: the original passed cookies={"Cookie": cookie_value},
        # which makes requests send a bogus cookie literally named "Cookie".
        # Send the raw captured string as the Cookie header instead.
        "Cookie": cookie_value,
    }
    all_results = []
    # The API accepts at most 10 IDs per call, so process in batches of 10.
    for i in range(0, len(questions_id), 10):
        # BUG FIX: the forum paste dropped the slice — the original sent the
        # entire ID list on every iteration. Take at most 10 IDs per batch.
        batch_question_ids = questions_id[i:i + 10]
        params = {
            "type": "1",
            "questionIds": ",".join(map(str, batch_question_ids)),
            "app": "web",
            "kav": "100",
            "av": "100",
            "hav": "100",
            "version": "3.0.0.0",
        }
        response = requests.get(url, headers=headers, params=params)
        if response.status_code != 200:
            # Report the failed batch and keep going with the rest.
            print(f"Error: {response.status_code} for batch starting at index {i}")
            continue
        for solution in response.json().get("solutions", []):
            # Replace the non-breaking-space underline placeholder with a
            # printable blank before stripping HTML.
            content = solution.get("content", "N/A")
            content = content.replace('<u>\xa0 \xa0 \xa0 \xa0 \xa0 \xa0\xa0</u>', '_______')
            content_text = BeautifulSoup(content, "html.parser").get_text()
            # BUG FIX: 'accessories' is a list, so the original
            # accessories.get("options") raised TypeError. Assume the first
            # accessory dict carries the options — TODO confirm against a
            # live JSON response.
            accessories = solution.get("accessories", [])
            formatted_options = []
            if accessories and isinstance(accessories[0], dict):
                option_list = accessories[0].get("options", [])
                for idx, option in enumerate(option_list[:4]):  # A-D only
                    formatted_options.append(f"{chr(65 + idx)}. {option}")
            # Strip HTML from the explanation text as well.
            solution_text = solution.get("solution", "N/A")
            solution_text_clean = BeautifulSoup(solution_text, "html.parser").get_text()
            all_results.append({
                "content": content_text,
                "options": formatted_options if formatted_options else ["N/A"],
                "solution": solution_text_clean,
            })
    return all_results
def format_data(all_results):
    """Pretty-print each downloaded question, its options, and its解析 to stdout."""
    for entry in all_results:
        print(f"题目: '{entry['content']}'")
        # Options header, then one option per line.
        print("选项:")
        for choice in entry['options']:
            print(choice)
        # Trailing blank line separates consecutive questions.
        print(f"解析: '{entry['solution']}'\n")
def save_to_txt(all_results, filename="./questions_output.txt"):
    """Write the downloaded questions to a UTF-8 text file for printing.

    all_results: list of dicts with keys 'content', 'options', 'solution'
    (as produced by download_question).
    filename: output path; the file is created/overwritten.
    """
    # "w" mode creates the file if it does not exist.
    with open(filename, "w", encoding="utf-8") as file:
        for item in all_results:
            # Question text.
            file.write(f"题目: {item['content']}\n")
            # Options, one per line.
            file.write("选项:\n")
            for option in item['options']:
                file.write(f"{option}\n")
            # Explanation, with a blank line between questions.
            file.write(f"解析: {item['solution']}\n\n")
    # BUG FIX: the success message printed a literal placeholder instead of
    # the destination path; interpolate the actual filename.
    print(f"数据已成功保存到 {filename}")
# Example usage:
# `data` is the JSON payload returned from the request
# data = fetch_data(cookie_value)  # fetch the complete JSON tree
# question_ids = extract_question_ids(data)
# if question_ids:
# print(question_ids)
# Script entry point: prompt for the session cookie, then
# fetch -> extract IDs -> download solutions -> save to txt.
cookie_value=input('请输入粉笔的cookies:')
# NOTE(review): a live session-cookie string used to be hard-coded in a
# comment here — redacted. Never commit real credentials; prompt for them
# (as above) or read them from the environment.
data = fetch_data(cookie_value)
print('')
if data :
    questions_id=extract_question_ids(data)# get the wrong-question IDs
    problems=download_question(questions_id)
    save_to_txt(problems)
    print(format_data(problems))

好家伙,用用看。学习一下。学习一下。学习了,这种返回的 HTML 还是要用 BeautifulSoup 处理才舒服。学习一下。
好东西啊。Python 做数据抓取还是很爽的。学习思路,谢谢。牛逼克拉斯,希望能做成 GUI 界面。