Help, please: scraping the comments under a hot Zhihu topic, but the exported file is blank
I'm about as green a beginner as they come. For a course project I need to scrape the discussion under a certain Zhihu topic and then run sentiment analysis on it.
I finally found a piece of code online that looked usable, but either I copied it wrong or something else is off: the exported file comes out completely blank.
Could someone take a look and tell me where it's actually going wrong?
Many thanks!
# -*- coding: utf-8 -*-
# @file:zhihu-spyder.py
import time
import xlrd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os
import excelSave as save
book_name_xls = "zhihu_data_0820.xls"  # path of the Excel file to write; created automatically if it does not exist
sheet_name_xls = '知乎数据'  # sheet name
# Used to scroll the page and trigger lazy loading
def Transfer_Clicks(browser):
    # time.sleep(5)
    try:
        browser.execute_script("window.scrollBy(0,document.body.scrollHeight)", "")
        time.sleep(5)
    except:
        pass
    return "Transfer successfully \n"
def insert_data(driver, answers, user_proposed):
    workbook = xlrd.open_workbook(book_name_xls)  # open the workbook
    sheets = workbook.sheet_names()  # names of all sheets in the workbook
    worksheet = workbook.sheet_by_name(sheets)  # get the first sheet of the workbook
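    # Note: sheet_names() returns a list, but sheet_by_name() expects a single
    # sheet name, so this line most likely raises an error as written;
    # sheets[0] is probably what was intended here.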
    # rows_old = worksheet.nrows  # number of rows already present in the sheet
    # rid = rows_old
    title_tag = driver.find_elements_by_xpath("//*[@class='QuestionHeader-title']")
    question_title = title_tag.text  # title of the question
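    # Note: find_elements_by_xpath() (plural) returns a list, which has no .text
    # attribute, so this line would raise AttributeError; find_element_by_xpath()
    # (singular) or title_tag[0].text is presumably what was intended.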
    # users = driver.find_elements_by_xpath("//*[@class='UserLink-link']")
    users = driver.find_elements_by_xpath("//*[@class='Avatar AuthorInfo-avatar']")  # get_attribute
    agrees = driver.find_elements_by_xpath("//*[@class='Button VoteButton VoteButton--up']")  # get_attribute
    times = driver.find_elements_by_xpath("//*[@class='ContentItem-time']/a/span")  # get_attribute
    # print(len(user), ' ', len(agree), ' ', len(time))
    for num in range(0, len(answers)):
        u_num = num
        if user_proposed == 1:
            u_num = num + 1
        username = users.get_attribute('alt')
        answer_content = answers.text
        answer_time = times.get_attribute('data-tooltip')
        agree = agrees.get_attribute('aria-label')
        value1 = [ ]
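        # Note: users, answers, times and agrees are lists, so the four assignments
        # above would need indexing (e.g. users[u_num], answers[num]) to work, and
        # value1 is an empty list here, so every row appended to the Excel file is
        # empty -- most likely the reason the exported file is blank. Judging from
        # the header row defined in spider(), something like the following was
        # probably intended:
        # value1 = [[num + 1, question_title, username, answer_content, answer_time, agree]]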
print("当前插入第%d条数据" % (num+1))
save.write_excel_xls_append_norepeat(book_name_xls, value1)
# Collect all answers on the current question page
def get_allAnswers(driver, user_proposed):
    after = 0
    n = 0
    while True:
        before = after
        Transfer_Clicks(driver)
        # time.sleep(3)
        answers = driver.find_elements_by_xpath("//*[@class='RichText ztext CopyrightRichText-richText']")
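        # Note: XPath @class='...' needs an exact match of the whole class attribute,
        # so if Zhihu has changed its markup, this and the other selectors above will
        # silently return empty lists -- worth printing len(answers) to check.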
print("当前包含回答最大数量:%d,n当前的值为:%d, n值到5说明已无法解析出新的回答" % (len(answers), n))
after = len(answers)
if after > before:
n = 0
if after == before:
n = n + 1
if n == 5:
print("当前问题最大回答数为:%d" % after)
insert_data(driver, answers, user_proposed)
break
"""
if len(answers) > maxQuestion:
print("当前回答数以达到%d条" % maxQuestion)
insert_data(driver, answers, user_proposed)
break
"""
def get_allQuestions(driver, maxQuestion):
    # Start collecting question links from the search results
    after = 0
    n = 0
    while True:
        before = after
        Transfer_Clicks(driver)
        time.sleep(3)
        question_link_tag = driver.find_elements_by_xpath("//*[@class='ContentItem AnswerItem']/h2/div/a")
        itemlen = len(question_link_tag)
        print("Questions found so far: %d, n is now %d; when n reaches 5, no new items can be loaded" % (itemlen, n))
        after = itemlen
        if after > before:
            n = 0
        if after == before:
            n = n + 1
        if n == 5:
            print("Maximum number of items for this keyword: %d" % after)
            break
        if itemlen > maxQuestion:
            print("The number of questions has reached %d" % maxQuestion)
            break
    return question_link_tag
# Run the spider
def spider(driver, keyword, maxQuestion):
    # Create the output file if it does not exist yet
    if os.path.exists(book_name_xls):
        print("File already exists")
    else:
        print("File does not exist, creating it")
        value_title = [["rid", "question", "username", "answer content", "post time", "agree count"]]
        save.write_excel_xls(book_name_xls, sheet_name_xls, value_title)
    user_name = 'my Zhihu username goes here'  # username
    user_pass = 'my Zhihu password goes here'  # password
    # Remember to log out after logging in, otherwise the login page cannot be reached next time and the later steps fail
    driver.get('https://www.zhihu.com/signin?next=%2F')  # login page
    elem = driver.find_elements_by_xpath("//*[@class='SignFlow-tab']")
    elem.click()  # switch to the password-login tab
    elem = driver.find_elements_by_xpath("//*[@class='SignFlow-accountInput Input-wrapper']/input")
    elem.send_keys(user_name)  # fill in the username
    elem = driver.find_elements_by_xpath("//*[@class='Input-wrapper']/input")
    elem.send_keys(user_pass)  # fill in the password
    elem = driver.find_elements_by_xpath("//*[@class='Button SignFlow-submitButton Button--primary Button--blue']")
    elem.click()  # click the login button
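    # Note: each elem above is a list returned by find_elements_by_xpath(), so the
    # .click() / .send_keys() calls on it would raise AttributeError; the singular
    # find_element_by_xpath() (or elem[0]) is presumably what was intended.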
    time.sleep(10)  # pause so a captcha / verification step can be completed by hand
    searchUrl = "https://www.zhihu.com/search?type=content&q=" + '普陀山天价便饭' '普陀山天价便饭事件'
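    # Note: the keyword argument is never used here -- the two adjacent string
    # literals are simply concatenated into one hard-coded query; presumably this
    # should be ... + keyword instead.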
    elem = driver.get(searchUrl)
    print("Topic link fetched, sleeping for 2 seconds")
    time.sleep(2)
    question_link_tag = get_allQuestions(driver, maxQuestion)
    link_list = []
    for link in question_link_tag:
        link_list.append(link.get_attribute('href'))
    for qlink in link_list:
        driver.get(qlink)  # open the page with all answers to one question
        time.sleep(3)
        """
        all_q_page = driver.find_elements_by_xpath("//*[@class='QuestionMainAction ViewAll-QuestionMainAction']")
        all_q = all_q_page.click()  # open "view all answers"; note: dismiss the login popup promptly or it will not load
        time.sleep(3)
        """
        answers = driver.find_elements_by_xpath("//*[@class='RichText ztext CopyrightRichText-richText']")
        users = driver.find_elements_by_xpath("//*[@class='Avatar AuthorInfo-avatar']")
        # print(len(answers), '', len(user))
        user_proposed = 0
        if len(answers) < len(users):
            user_proposed = 1
        get_allAnswers(driver, user_proposed)
if __name__ == '__main__':
    """
    To use this approach, open cmd before running the program and start Chrome with:
    chrome.exe --remote-debugging-port=9222 --user-data-dir="D:\selenum\AutomationProfile"
    """
    chrome_options = Options()
    chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
    chrome_driver = ""  # location of chromedriver.exe
    driver = webdriver.Chrome()
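    # Note: chrome_options and chrome_driver are never passed to webdriver.Chrome(),
    # so the remote-debugging setup above has no effect; something like
    # webdriver.Chrome(executable_path=chrome_driver, options=chrome_options)
    # was presumably intended.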
    maxQuestion = 4  # maximum number of questions to scrape
    keywords = ["普陀山天价便饭", "普陀山天价便饭事件"]  # list of search keywords
    for keyword in keywords:
        spider(driver, keyword, maxQuestion)
Just here to watch... Try setting a breakpoint and stepping through it yourself first -- check whether the selectors are actually matching any data.
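Following up on that suggestion, here is a minimal sketch of how to check whether the XPath selectors match anything, and what a populated row might look like. It reuses book_name_xls, save.write_excel_xls_append_norepeat and the selectors from the script above; the singular find_element call, the list indexing, and the contents of value1 are assumptions about what the original author intended, not a verified fix.

# Quick sanity check: print how many nodes each selector matches on an open
# question page, before trying to write anything to Excel.
answers = driver.find_elements_by_xpath("//*[@class='RichText ztext CopyrightRichText-richText']")
users = driver.find_elements_by_xpath("//*[@class='Avatar AuthorInfo-avatar']")
agrees = driver.find_elements_by_xpath("//*[@class='Button VoteButton VoteButton--up']")
times = driver.find_elements_by_xpath("//*[@class='ContentItem-time']/a/span")
print(len(answers), len(users), len(agrees), len(times))  # all zeros means the selectors no longer match Zhihu's markup

# If the counts look sane, build a non-empty row per answer and append it.
# (The original loop wrote value1 = [] and therefore only appended empty rows.)
question_title = driver.find_element_by_xpath("//*[@class='QuestionHeader-title']").text
for num in range(len(answers)):
    u_num = num + 1 if len(answers) < len(users) else num   # skip the asker's avatar if it is in the list
    value1 = [[num + 1,                                      # rid
               question_title,                               # question
               users[u_num].get_attribute('alt'),            # username
               answers[num].text,                            # answer content
               times[num].get_attribute('data-tooltip'),     # post time
               agrees[num].get_attribute('aria-label')]]     # agree count
    print("Inserting row %d" % (num + 1))
    save.write_excel_xls_append_norepeat(book_name_xls, value1)

If the first print shows only zeros, the problem is the selectors (Zhihu's page structure has changed or the answers never loaded); if it shows non-zero counts but the file is still empty, the problem is almost certainly the empty value1 row in insert_data().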