# [Python] source listing (forum code-viewer header: "view as plain text / copy code")
import re
import requests
from lxml import etree
from selenium.webdriver import Firefox
from selenium.webdriver.support.select import Select
from selenium.webdriver.firefox.options import Options
import json
from jsonpath import jsonpath
import time
class You(object):
    """Scrape question text and answers of a ulearning.cn textbook into a text file.

    The instance holds the request headers/cookies, the currently open output
    file (``self.f``) and the most recently downloaded chapter response
    (``self.text``).
    """

    # Pause before each chapter download, in seconds.
    CHAPTER_DELAY_SECONDS = 10
    # Course scraped by default; replace it with an id from the list that
    # init_url() prints.
    DEFAULT_COURSE_ID = '11451'
    # Base name of the default output file ('.txt' is appended); any name works.
    DEFAULT_COURSE_NAME = '新交互大学英语1(第2版)New Interactive College English 1 (2.0)'
    # Matches any HTML tag, including tags that span several lines.
    _TAG_RE = re.compile(r'<.*?>', re.S)

    def __init__(self, url):
        """Store the textbook page URL and the credentials sent with every request.

        Args:
            url: Textbook page URL; init_url() reads the digits that follow
                ``courseId=`` from it.
        """
        self.name = None
        self.tail = None
        self.url = url
        self.f = None      # output file, opened while a course is being scraped
        self.text = None   # last chapter response, consumed by get_data()
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0",
            # Placeholder ("fill in here"): replace with your own token.
            "Authorization": '这里填写',
        }
        # Placeholder ("put cookies here"): replace with your own cookie string.
        raw_cookie = '这里填cookies'
        self.cookie = {
            # UTF-8 bytes re-read as latin-1 text; presumably so that non-ASCII
            # cookie values can be sent in an HTTP header -- TODO confirm.
            'Cookie': raw_cookie.encode('utf-8').decode("latin1"),
        }

    def get_answer(self, parentId, quetionID):
        """Download the answer of one question and append it to the output file.

        Args:
            parentId: Value sent as the ``parentId`` query parameter.
            quetionID: Question id placed in the request path.

        Returns:
            ``''`` when the response has no ``correctAnswerList`` key (nothing
            is written in that case), otherwise ``None``.
        """
        url = 'https://api.ulearning.cn/questionAnswer/' + str(quetionID) + '?parentId=' + str(parentId)
        print(url)
        response = requests.get(url, headers=self.headers, cookies=self.cookie)
        answer_json = json.loads(response.content.decode())
        print(answer_json)
        try:
            answers = answer_json['correctAnswerList']
        except KeyError:
            return ''
        self.f.write(' 答案: ')
        if len(answers) == 1:
            # str() keeps non-string answers from raising TypeError.
            self.f.write(str(answers[0]) + " \n")
        else:
            for number, answer in enumerate(answers, start=1):
                self.f.write(str(number) + '. ' + str(answer) + ' ')
            self.f.write('\n')

    def DataReplace(self, text):
        """Return *text* as plain text: ``<br>`` becomes a newline, other tags are dropped."""
        text = text.replace('<br>', '\n')
        text = self._TAG_RE.sub('', text)
        # NOTE(review): as written this removes every space character; the
        # listing this file came from may have lost an '&nbsp;' entity here --
        # confirm against the intended output before changing it.
        return text.replace(' ', '')

    def _fetch_answer(self, parent_id, question_id):
        """Call get_answer() only when both ids are known."""
        if parent_id is None or question_id is None:
            return
        self.get_answer(parentId=parent_id, quetionID=question_id)

    def _write_choices(self, question, question_id):
        """Write the question's lettered options on one line.

        Returns the id to use for the answer lookup: the ``questionid`` of the
        last option written, or *question_id* unchanged when there is none.
        """
        try:
            letter = 'A'
            for choice in question['choiceitemModels']:
                choice_title = self.DataReplace(choice['title'])
                question_id = choice['questionid']
                self.f.write(letter + '. ' + choice_title + ' ')
                letter = chr(ord(letter) + 1)
            self.f.write('\n')
        except (KeyError, TypeError, AttributeError):
            # Options are best-effort: missing or malformed option data is
            # skipped, but interrupts and I/O errors are no longer swallowed.
            pass
        return question_id

    def _write_sub_questions(self, question, parent_id, question_id):
        """Write sub-questions with their options and fetch the matching answers.

        A question without ``subQuestionModels`` gets a single answer lookup for
        the question itself. Returns the last question id that was seen.
        """
        try:
            for number, sub_question in enumerate(question['subQuestionModels'], start=1):
                question_id = sub_question['questionid']
                sub_title = self.DataReplace(sub_question['title'])
                self.f.write(' (' + str(number) + '). ' + sub_title + '\n')
                try:
                    letter = 'A'
                    for choice in sub_question['choiceitemModels']:
                        choice_title = self.DataReplace(choice['title'])
                        if choice_title == '':
                            continue
                        self.f.write(' ' + letter + '. ' + choice_title + '\n')
                        letter = chr(ord(letter) + 1)
                except KeyError:
                    # No (complete) option list: the answer is still fetched below.
                    pass
                self._fetch_answer(parent_id, question_id)
        except KeyError:
            self._fetch_answer(parent_id, question_id)
        return question_id

    def get_data(self):
        """Write every question of the chapter held in ``self.text`` to ``self.f``.

        Pages are read in order until one whose ``content`` is
        ``'Unit Objective'``, which ends that page group. Questions are numbered
        from 1 within each ``questionDTOList``. A response without
        ``wholepageItemDTOList`` produces no output.
        """
        payload = json.loads(self.text.content.decode())
        # Path of interest: $.wholepageItemDTOList[i].wholepageDTOList[j].content
        page_groups = payload.get('wholepageItemDTOList') or []
        # Ids carry over from the previous page/question when a record has none.
        parent_id = None
        question_id = None
        for group in page_groups:
            for page in group['wholepageDTOList']:
                if page['content'] == 'Unit Objective':
                    break
                parent_id = page.get('id', parent_id)
                for course_page in page['coursepageDTOList']:
                    if 'questionDTOList' not in course_page:
                        continue
                    number = 1
                    for question in course_page['questionDTOList']:
                        question_id = question.get('questionid', question_id)
                        title = self.DataReplace(question['title'])
                        if title == '':
                            # An empty title ends this question list.
                            break
                        self.f.write(str(number) + title + '\n')
                        number += 1
                        question_id = self._write_choices(question, question_id)
                        question_id = self._write_sub_questions(question, parent_id, question_id)

    def get_url(self, classId, id):
        """Walk the chapters of one course and dump each of them via get_data().

        Args:
            classId: Value sent as the ``classId`` query parameter.
            id: Course id placed in the directory request path.
        """
        url = 'https://api.ulearning.cn/course/stu/' + str(id) + '/directory?classId=' + str(classId)
        response = requests.get(url, headers=self.headers, cookies=self.cookie)
        print(response)
        directory = json.loads(response.content.decode())
        for chapter in directory.get('chapters') or []:
            print('等待中')
            time.sleep(self.CHAPTER_DELAY_SECONDS)
            unit_title = self.DataReplace(chapter['nodetitle'])
            self.f.write(' ' + unit_title + '\n')
            chapter_url = 'https://api.ulearning.cn/wholepage/chapter/stu/' + str(chapter['nodeid'])
            chapter_response = requests.get(chapter_url, headers=self.headers, cookies=self.cookie)
            chapter_response.encoding = 'utf-8'
            self.text = chapter_response
            self.get_data()

    def _scrape_course(self, class_id, course_id, name):
        """Append one course's questions and answers to ``<name>.txt``."""
        self.name = name
        # The context manager closes the file even if a request fails midway.
        with open(name + '.txt', 'a', encoding='UTF-8') as output:
            self.f = output
            self.get_url(class_id, course_id)

    def init_url(self, course_id=DEFAULT_COURSE_ID, name=DEFAULT_COURSE_NAME, *, crawl_all=False):
        """List the courses behind ``self.url`` and scrape one (or all) of them.

        Args:
            course_id: Id of the single course to scrape; choose it from the
                ``id name`` list this method prints.
            name: Base name of the output file for that course.
            crawl_all: When true, scrape every listed course into a file named
                after the course instead of only *course_id*.

        Raises:
            ValueError: If ``self.url`` has no ``courseId=<digits>`` part, or the
                class information response contains no ``<classId>`` element.
        """
        found = re.findall(r'.*?courseId=(\d+)', self.url)
        if not found:
            raise ValueError('no courseId=<digits> found in url: ' + repr(self.url))
        textbook_id = found[0]
        info_url = 'https://courseapi.ulearning.cn/classes/information/student/' + textbook_id + '?lang=zh'
        class_response = requests.get(info_url, headers=self.headers, cookies=self.cookie)
        class_ids = re.findall(r'<classId>(\d+)</classId>', class_response.text)
        list_url = 'https://courseapi.ulearning.cn/textbook/student/' + textbook_id + '/list?lang=zh'
        list_response = requests.get(list_url, headers=self.headers, cookies=self.cookie)
        courses = re.findall(r'<courseId>(\d+)</courseId><name>(.*?)</name>', list_response.text)
        for listed_id, listed_name in courses:
            print(listed_id + ' ' + listed_name)
        if not class_ids:
            raise ValueError('no <classId> element found in the class information response')
        if crawl_all:
            for listed_id, listed_name in courses:
                self._scrape_course(class_ids[0], listed_id, listed_name)
        else:
            self._scrape_course(class_ids[0], course_id, name)

    def run(self):
        """Entry point: scrape the default course."""
        self.init_url()
if __name__ == '__main__':
    # Replace this URL with your own textbook page URL.
    textbook_page_url = 'https://courseweb.ulearning.cn/ulearning/index.html#/course/textbook?courseId=93790'
    scraper = You(textbook_page_url)
    scraper.run()