考试需要用到时政知识,发现时事一点通的时政内容比较好,还有练习题和答案。就打算爬取它的每日时政的内容、练习题和答案。
开始打算直接用requests解析,发现练习题需要登录账号才能获取,还有数据响应超时,加上headers也不起作用。
然后改用谷歌的无头浏览器,却无法定位答案页上的“提交答案”按钮。
研究了好几天,最后决定用360浏览器进行自动操作。终于成功了。
安装好必要的包,把代码里带星号的网址改成网站的真实网址,就可以成功执行了。
大家可以实际操作一下,可以看到效果。
[Python] 代码:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from pyquery import PyQuery as pq
from time import sleep
import requests
import re
import json
# Launch the 360 Secure Browser through Selenium's Chrome driver by pointing
# the Chrome options at the 360 executable.
# Install path of the 360 browser; adjust it to the local machine.
__browser_url = r'C:\Users\Administrator\AppData\Roaming\360se6\Application\360se.exe'
chrome_options = Options()
chrome_options.binary_location = __browser_url
# `options=` is the supported keyword; `chrome_options=` was deprecated by
# Selenium and is rejected by current releases.
driver = webdriver.Chrome(options=chrome_options)
def get_liebiao(more_clicks=1):
    """Log in, open the news list and yield one entry per listed article.

    Drives the module-level ``driver``: opens the login page, switches to the
    password tab, submits the credentials, clicks the third navigation entry
    (the news section, per the original author's note) and then clicks the
    "more" button ``more_clicks`` times to load older articles.

    Args:
        more_clicks: How many times to click the "more" button. The original
            author used 1 for a month of articles and 13 for about a year.

    Yields:
        dict: ``{'biaoti': <title>, 'kaodian_url': <absolute article URL>}``
        in reverse page order. The author notes the page lists the newest
        article first, so entries come out oldest first.
    """
    driver.get('http://www.*****.com/login')  # login page
    driver.find_element(By.CSS_SELECTOR, '#tab-tabPwd').click()
    sleep(10)
    # Account name.
    driver.find_element(
        By.CSS_SELECTOR,
        '#pane-tabPwd > form > div:nth-child(1) > div > div > input',
    ).send_keys("**********")
    # Password.
    driver.find_element(By.NAME, 'password').send_keys("*********")
    driver.find_element(
        By.CSS_SELECTOR,
        '#pane-tabPwd > form > div:nth-child(3) > div > button > span',
    ).click()
    sleep(5)
    # After login the site redirects to the home page; open the news section.
    driver.find_element(
        By.CSS_SELECTOR,
        '#app > header > div.container > ul > li:nth-child(3) > a',
    ).click()
    sleep(5)
    # Each click on "more" appends another batch of older articles to the list.
    for _ in range(more_clicks):
        driver.find_element(
            By.CSS_SELECTOR,
            '#app > div.container > div > div.el-col.el-col-24.el-col-xs-24.el-col-sm-24.el-col-md-16 > div > button',
        ).click()
        sleep(5)
    html = driver.page_source
    pattern = re.compile('class="article-item".*?href="(.*?)" target="_blank" title="(.*?)">', re.S)
    kaodians = pattern.findall(html)
    # Reverse so the caller processes articles from the oldest date forward.
    for href, title in reversed(kaodians):
        yield {
            'biaoti': title,
            'kaodian_url': 'http://www.*****.com' + href,  # absolute article link
        }
def write_to_file(wenjian, content):
    """Append *content* as one JSON-encoded line to ``<wenjian>.txt``.

    Args:
        wenjian: Output path without the ``.txt`` extension. The callers in
            this file use a separate name for articles, questions and answers.
        content: Any JSON-serialisable value. A plain string is written
            wrapped in double quotes, so ``''`` produces a line holding ``""``.
    """
    with open(wenjian + '.txt', 'a', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
    # No explicit close(): the with-statement already closes the file.
def get_timu(html):
    """Yield the practice questions found in *html*, extracted with a regex.

    Each match is an ``<h3>`` heading followed by four ``<span>``/``<div>``
    pairs, all carrying the ``data-v-238eeb99`` attribute.

    Args:
        html: Page source of the practice-question page.

    Yields:
        dict: ``'timu'`` is the three heading fragments joined together
        (number + type + stem, per the original author's note). ``'x1'`` to
        ``'x4'`` are the four options, each formatted as
        ``<span text>、<div text>``.
    """
    question_re = re.compile(
        '<h3 data-v-238eeb99="">(.*?)<em data-v-238eeb99="">(.*?)</em>(.*?)</h3>.*?'
        + '<span data-v-238eeb99="">(.*?)</span> <div data-v-238eeb99="">(.*?)</div>.*?'
        + '<span data-v-238eeb99="">(.*?)</span> <div data-v-238eeb99="">(.*?)</div>.*?'
        + '<span data-v-238eeb99="">(.*?)</span> <div data-v-238eeb99="">(.*?)</div>.*?'
        + '<span data-v-238eeb99="">(.*?)</span> <div data-v-238eeb99="">(.*?)</div>', re.S)
    for found in question_re.finditer(html):
        (head, kind, stem,
         mark1, text1,
         mark2, text2,
         mark3, text3,
         mark4, text4) = found.groups()
        question = {'timu': head + kind + stem}
        question['x1'] = mark1 + '、' + text1
        question['x2'] = mark2 + '、' + text2
        question['x3'] = mark3 + '、' + text3
        question['x4'] = mark4 + '、' + text4
        yield question
def get_daan(html):
    """Extract the answers from *html* and append them to the 'daan' file.

    For every match, the text following the ``<h3>`` opening tag is joined
    with the ``<span>`` and ``<div>`` contents of the element whose class
    ends in ``option right``. Each joined string is written as one line via
    ``write_to_file``.

    Args:
        html: Page source of the result page shown after submitting answers.
    """
    answer_re = re.compile(
        '<h3 data-v-1c52a667="">(.*?)<em.*?option right"><span data-v-1c52a667="">(.*?)</span> <div data-v-1c52a667="">(.*?)</div>',
        re.S)
    for heading, mark, text in answer_re.findall(html):
        write_to_file('daan', heading + mark + text)
def get_kaodian(kaodian):
    """Scrape one daily article: its text, its practice questions and answers.

    Opens the article in a second browser tab, saves every ``<p>`` element to
    the 'kaodian' file, then clicks through to the practice page. If that
    page's URL contains ``p_type=02``, the questions are saved to 'tiku', the
    quiz is submitted and the answers are saved to 'daan'. The tab is closed
    and focus returns to the first tab before returning.

    Args:
        kaodian: dict as yielded by ``get_liebiao`` with the keys ``'biaoti'``
            (article title) and ``'kaodian_url'`` (absolute article URL).
    """
    # Work in a new tab so the list page in the first tab stays untouched.
    js = 'window.open("' + kaodian["kaodian_url"] + '");'
    driver.execute_script(js)
    driver.switch_to.window(driver.window_handles[1])
    sleep(5)

    # --- article text ---------------------------------------------------
    paragraphs = driver.find_elements(By.TAG_NAME, 'p')
    # Empty strings are written as separator lines around the title.
    write_to_file('kaodian', '')
    write_to_file('kaodian', kaodian["biaoti"])
    write_to_file('kaodian', '')
    for paragraph in paragraphs:
        write_to_file('kaodian', '[[' + paragraph.text + ']]')

    # --- practice questions ---------------------------------------------
    # NOTE(review): this button presumably opens the practice test for the
    # article; confirm against the live page.
    driver.find_element(
        By.CSS_SELECTOR,
        '#app > div.container > div > div.el-col.el-col-24.el-col-xs-24.el-col-sm-24.el-col-md-16 > div > button > span',
    ).click()
    sleep(5)
    # Guard added by the original author after hitting a problem while
    # scraping: continue only when the practice page actually loaded.
    if 'p_type=02' in driver.current_url:
        html = driver.page_source
        write_to_file('tiku', '')
        write_to_file('tiku', '测试:' + kaodian["biaoti"])
        write_to_file('tiku', '')
        for item in get_timu(html):
            write_to_file('tiku', item['timu'])
            write_to_file('tiku', item['x1'])
            write_to_file('tiku', item['x2'])
            write_to_file('tiku', item['x3'])
            write_to_file('tiku', item['x4'])

        # --- submit and collect answers ---------------------------------
        # Hover over the second <button> on the page first (presumably the
        # "submit answers" button), then look it up again before clicking.
        move = driver.find_elements(By.XPATH, "//button")[1]
        ActionChains(driver).move_to_element(move).perform()
        sleep(5)
        driver.find_elements(By.XPATH, "//button")[1].click()
        sleep(5)
        # Click the primary button of the message box that appears.
        driver.find_element(
            By.CSS_SELECTOR,
            'body > div.el-message-box__wrapper > div > div.el-message-box__btns > button.el-button.el-button--default.el-button--small.el-button--primary',
        ).click()
        sleep(5)
        write_to_file('daan', '')
        write_to_file('daan', '测试答案:' + kaodian["biaoti"])
        write_to_file('daan', '')
        get_daan(driver.page_source)

    # Close the article tab and return to the list tab.
    driver.close()
    driver.switch_to.window(driver.window_handles[0])
def main():
    """Scrape every article yielded by get_liebiao(), then close the browser."""
    try:
        for kaodian in get_liebiao():
            get_kaodian(kaodian)
    finally:
        # Quit even when scraping fails; otherwise the browser and the driver
        # process are left running.
        driver.quit()


if __name__ == '__main__':
    main()
|