# -*- coding: utf-8 -*-
# @file :zhihu-spyder.py
import time
import xlrd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os
import excelSave as save
book_name_xls = "zhihu_data_0820.xls" # path of the output Excel file; it is created automatically if it does not exist
sheet_name_xls = '知乎数据' # sheet name
# Scroll the page to the bottom to trigger lazy loading of more content
def Transfer_Clicks(browser):
    # time.sleep(5)
    try:
        browser.execute_script("window.scrollBy(0,document.body.scrollHeight)", "")
        time.sleep(5)
    except Exception:
        pass
    return "Transfer successfully \n"
def insert_data(driver, answers, user_proposed):
    workbook = xlrd.open_workbook(book_name_xls) # open the workbook
    sheets = workbook.sheet_names() # names of all sheets in the workbook
    worksheet = workbook.sheet_by_name(sheets[0]) # first sheet of the workbook
    # rows_old = worksheet.nrows # number of data rows already present
    # rid = rows_old
    title_tag = driver.find_elements_by_xpath("//*[@class='QuestionHeader-title']")
    question_title = title_tag[0].text # title of the question
    # users = driver.find_elements_by_xpath("//*[@class='UserLink-link']")
    users = driver.find_elements_by_xpath("//*[@class='Avatar AuthorInfo-avatar']") # get_attribute
    agrees = driver.find_elements_by_xpath("//*[@class='Button VoteButton VoteButton--up']") # get_attribute
    times = driver.find_elements_by_xpath("//*[@class='ContentItem-time']/a/span") # get_attribute
    # print(len(users), ' ', len(agrees), ' ', len(times))
    for num in range(0, len(answers)):
        u_num = num
        if user_proposed == 1:
            # the asker's avatar precedes the answerers', so shift the user index by one
            u_num = num + 1
        username = users[u_num].get_attribute('alt')
        answer_content = answers[num].text
        answer_time = times[num].get_attribute('data-tooltip')
        agree = agrees[num].get_attribute('aria-label')
        value1 = [ [num+1, question_title, username, answer_content, answer_time, agree] ]
        print("Inserting data row %d" % (num+1))
        save.write_excel_xls_append_norepeat(book_name_xls, value1)
# Collect all answers loaded on the current page
def get_allAnswers(driver, user_proposed):
    after = 0
    n = 0
    while True:
        before = after
        Transfer_Clicks(driver)
        # time.sleep(3)
        answers = driver.find_elements_by_xpath("//*[@class='RichText ztext CopyrightRichText-richText']")
        print("Answers loaded so far: %d, current n: %d; n reaching 5 means no new answers can be parsed" % (len(answers), n))
        after = len(answers)
        if after > before:
            n = 0
        if after == before:
            n = n + 1
        if n == 5:
            print("Maximum number of answers for this question: %d" % after)
            insert_data(driver, answers, user_proposed)
            break
        """
        if len(answers) > maxQuestion:
            print("Answer count has reached %d" % maxQuestion)
            insert_data(driver, answers, user_proposed)
            break
        """
def get_allQuestions(driver, maxQuestion):
    # start collecting question links from the search results
    after = 0
    n = 0
    while True:
        before = after
        Transfer_Clicks(driver)
        time.sleep(3)
        question_link_tag = driver.find_elements_by_xpath("//*[@class='ContentItem AnswerItem']/h2/div/a")
        itemlen = len(question_link_tag)
        print("Question links loaded so far: %d, current n: %d; n reaching 5 means no new items can be parsed" % (itemlen, n))
        after = itemlen
        if after > before:
            n = 0
        if after == before:
            n = n + 1
        if n == 5:
            print("Maximum number of items for this keyword: %d" % after)
            break
        if itemlen > maxQuestion:
            print("Question count has reached %d" % maxQuestion)
            break
    return question_link_tag
# Run the spider
def spider(driver, keyword, maxQuestion):
    # create the output file if it does not exist yet
    if os.path.exists(book_name_xls):
        print("File already exists")
    else:
        print("File not found, creating it")
        value_title = [ ["rid", "问题", "用户名称", "回答内容", "发布时间", "赞同数"] ]
        save.write_excel_xls(book_name_xls, sheet_name_xls, value_title)
    user_name = 'your username here' # account name
    user_pass = 'your password here' # password
    # remember to log out after each run, otherwise the login page cannot be reached and later steps fail
    driver.get('https://www.zhihu.com/signin?next=%2F') # login page
    elem = driver.find_elements_by_xpath("//*[@class='SignFlow-tab']")
    elem[0].click() # switch to the password-login tab
    elem = driver.find_elements_by_xpath("//*[@class='SignFlow-accountInput Input-wrapper']/input")
    elem[0].send_keys(user_name) # fill in the user name
    elem = driver.find_elements_by_xpath("//*[@class='Input-wrapper']/input")
    elem[0].send_keys(user_pass) # fill in the password
    elem = driver.find_elements_by_xpath("//*[@class='Button SignFlow-submitButton Button--primary Button--blue']")
    elem[0].click() # click the login button
    time.sleep(10) # pause for manual verification (captcha)
    searchUrl = "https://www.zhihu.com/search?type=content&q=" + keyword
    driver.get(searchUrl)
    print("Search page loaded, sleeping for 2 seconds")
    time.sleep(2)
    question_link_tag = get_allQuestions(driver, maxQuestion)
    link_list = []
    for link in question_link_tag:
        link_list.append(link.get_attribute('href'))
    for qlink in link_list:
        driver.get(qlink) # open the all-answers page of a single question
        time.sleep(3)
        """
        all_q_page = driver.find_elements_by_xpath("//*[@class='QuestionMainAction ViewAll-QuestionMainAction']")
        all_q = all_q_page[0].click() # open "view all answers"; note: dismiss the login popup promptly or the list will not load
        time.sleep(3)
        """
        answers = driver.find_elements_by_xpath("//*[@class='RichText ztext CopyrightRichText-richText']")
        users = driver.find_elements_by_xpath("//*[@class='Avatar AuthorInfo-avatar']")
        # print(len(answers), ' ', len(users))
        # an extra avatar means the asker's avatar is also on the page, so user indices need shifting
        user_proposed = 0
        if len(answers) < len(users):
            user_proposed = 1
        get_allAnswers(driver, user_proposed)
if __name__ == '__main__':
    """
    To attach to an already-running browser, start Chrome from cmd before running this script:
    chrome.exe --remote-debugging-port=9222 --user-data-dir="D:\selenum\AutomationProfile"
    """
    chrome_options = Options()
    chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
    chrome_driver = "" # path to chromedriver.exe
    driver = webdriver.Chrome(executable_path=chrome_driver, options=chrome_options)
    maxQuestion = 4 # maximum number of questions to crawl
    keywords = ["普陀山天价便饭","普陀山天价便饭事件"] # list of search keywords
    for keyword in keywords:
        spider(driver, keyword, maxQuestion)
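
The post does not include the local excelSave module that the script imports. Below is a minimal sketch of what it might look like, assuming the classic xlrd/xlwt/xlutils stack for .xls files. The function names match the calls above; the de-duplication key (the answer-content column) is an assumption, since the original module is not shown.

# excelSave.py: a hypothetical sketch of the helper module, not the original
import xlrd
import xlwt
from xlutils.copy import copy

def write_excel_xls(path, sheet_name, value):
    # create a new workbook with a single sheet and write the header rows
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet(sheet_name)
    for i, row in enumerate(value):
        for j, cell in enumerate(row):
            sheet.write(i, j, cell)
    workbook.save(path)

def write_excel_xls_append_norepeat(path, value):
    # append rows to the first sheet, skipping rows whose answer content
    # (column 3) is already present; the exact "norepeat" rule is an assumption
    rb = xlrd.open_workbook(path)
    rs = rb.sheet_by_index(0)
    existing = {rs.cell_value(r, 3) for r in range(1, rs.nrows)}
    wb = copy(rb)
    ws = wb.get_sheet(0)
    row_idx = rs.nrows
    for row in value:
        if row[3] in existing:
            continue
        for j, cell in enumerate(row):
            ws.write(row_idx, j, cell)
        row_idx += 1
    wb.save(path)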
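
Note that find_elements_by_xpath and the executable_path argument belong to the Selenium 3 API and were removed in Selenium 4. If you run the script against Selenium 4, the equivalent calls look roughly like this (a sketch, not part of the original script):

# Selenium 4 equivalents (sketch): locator-based lookup and a Service object for the driver path
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
driver = webdriver.Chrome(service=Service("chromedriver.exe"), options=chrome_options) # hypothetical driver path
answers = driver.find_elements(By.XPATH, "//*[@class='RichText ztext CopyrightRichText-richText']")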