#!/usr/bin/env python
# -*- coding: utf-8 -*-
from urllib import quote,unquote
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import json
from time import sleep
import re
import codecs
class Weibo(object):
    """Selenium-driven scraper for Weibo user-search results and user feeds.

    The class opens a Firefox window, loads a user-search page, lets the
    operator pick one of the listed accounts on the console, then opens that
    account's main page in a second window and dumps the visible feed items
    to ``./<nickName>.json`` (one JSON object per line).

    NOTE: the module targets Python 2 (``run`` prompts with ``raw_input``).
    """

    def __init__(self, url=""):
        """Start the Firefox session used by every other method.

        Args:
            url: Stored as ``self.url``; no method of this class reads it.
        """
        self.url = url
        # NOTE: a headless FirefoxOptions setup was tried here previously and
        # left disabled; the browser is started with a visible window.
        print("正在打开浏览器,请稍后...")
        self.browser = webdriver.Firefox()

    def GetPageSource(self, url, mate, callback):
        """Load ``url``, wait for an element, and hand the HTML to ``callback``.

        Args:
            url: Address to open in the current browser window.
            mate: Locator tuple such as ``(By.ID, "some_id")`` passed to
                ``EC.presence_of_element_located``.
            callback: Callable receiving the page source string.

        Returns:
            Whatever ``callback`` returns.

        Raises:
            Whatever ``WebDriverWait.until`` raises when the element does not
            appear within 10 seconds (not caught here).
        """
        print("正在打开网页,请稍后...")
        self.browser.get(url)
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located(mate))
        return callback(self.browser.page_source)

    def GetUserList(self, url):
        """Parse the user-search result page into a list of user dicts.

        Args:
            url: Despite the name, this is the page's HTML source (it is fed
                straight to BeautifulSoup), not an address.

        Returns:
            A list of dicts with the keys ``nickName``, ``mainPage``,
            ``Address``, ``Card``, ``Num`` and ``PersonInfo``. ``Card`` is an
            empty string when the entry has no ``.person_card`` element.

        Raises:
            IndexError: If an entry lacks one of the mandatory elements.
        """
        print ("正在获得找人页面所有匹配到的信息,请稍后...")
        retUserList = []
        soup = BeautifulSoup(url, "lxml")
        for user in soup.select("#pl_user_feedList .list_person"):
            nameLink = user.select(".person_name")[0].a
            cards = user.select(".person_card")
            retUserList.append({
                "nickName": nameLink['title'],
                # The href is prefixed with a scheme; presumably the page
                # emits protocol-relative links ("//weibo.com/...").
                "mainPage": "https:" + nameLink['href'],
                "Address": user.select(".person_addr > span:nth-of-type(2)")[0].get_text(),
                "Card": cards[0].get_text(strip=True) if cards else "",
                # Collapse the multi-line counters into one space-separated line.
                "Num": " ".join(user.select(".person_num")[0].get_text().lstrip().split("\n")),
                "PersonInfo": re.sub("[\t\n]", "", user.select(".person_info")[0].get_text()),
            })
        return retUserList

    def GetPersonPageContent(self, url):
        """Parse a user's main page into a list of feed-item dicts.

        Args:
            url: Despite the name, this is the page's HTML source.

        Returns:
            A list of dicts with the keys ``id`` (1-based position of the
            item in the feed, as a string), ``from``, ``text`` and
            ``videoOrImg`` (empty string when the item has no media wrapper).
            Items that cannot be parsed are skipped, so ids may have gaps.
        """
        print("正在或者个人页面信息,请稍后")
        soup = BeautifulSoup(url, "lxml")
        contentList = soup.select(
            "div[node-type='feed_list'] > div[action-type='feed_list_item']")
        retPersonInfoList = []
        for position, post in enumerate(contentList, 1):
            try:
                mediaWraps = post.select(".WB_media_wrap")
                contentInfo = {
                    "id": str(position),
                    "from": post.select(".WB_from")[0].get_text(strip=True),
                    "text": post.select(".WB_text.W_f14")[0].get_text(strip=True),
                    "videoOrImg": self.GetImageOrVideoPath(mediaWraps[0]) if mediaWraps else "",
                }
            except Exception:
                # Best effort: a malformed item is skipped. ``Exception``
                # (not a bare except) keeps Ctrl-C and SystemExit working.
                continue
            retPersonInfoList.append(contentInfo)
        return retPersonInfoList

    def GetImageOrVideoPath(self, source):
        """Extract the video or picture address(es) from a media wrapper.

        Args:
            source: Parsed ``.WB_media_wrap`` element of one feed item.

        Returns:
            The decoded video address, or the picture address(es) taken from
            the ``action-data`` attribute (comma-joined when several), or the
            ``src`` of the first ``.WB_pic`` image when the media element has
            no ``action-data`` attribute. ``None`` when ``action-data`` is
            empty or carries no ``clear_picSrc`` parameter.

        Raises:
            IndexError: If the wrapper contains no ``.WB_media_a`` element.
        """
        media = source.select(".WB_media_a")[0]
        videos = media.select(".WB_video")
        if videos:
            # The first 8 characters of the attribute are dropped before the
            # value is percent-decoded twice; presumably a fixed "key="
            # prefix -- TODO confirm against a live page.
            return unquote(unquote(videos[0]["video-sources"][8:]))
        try:
            actionData = media["action-data"]
        except KeyError:
            # No action-data attribute: fall back to the thumbnail itself.
            return media.select(".WB_pic")[0].a.img["src"]
        return self._ParsePictureSources(actionData)

    @staticmethod
    def _ParsePictureSources(actionData):
        """Turn an ``action-data`` query string into picture address(es).

        Args:
            actionData: Value of the media element's ``action-data``
                attribute, e.g. ``"pic_ids=a,b&clear_picSrc=%2F%2Fh%2Fa.jpg"``.

        Returns:
            ``"https:"``-prefixed, percent-decoded address(es); several are
            joined with commas when ``pic_ids`` is present. ``None`` when the
            string is empty or has no ``clear_picSrc`` parameter.
        """
        if not actionData:
            return None
        if "pic_ids" in actionData:
            # Accept end-of-string as well as "&" so the match does not fail
            # when clear_picSrc happens to be the last parameter.
            found = re.search("clear_picSrc=(.*?)(?:&|$)", actionData)
            if found is None:
                return None
            return ",".join(
                "https:%s" % unquote(img) for img in found.group(1).split(","))
        found = re.search("clear_picSrc=(.*?)$", actionData)
        if found is None:
            return None
        return "https:" + unquote(found.group(1))

    def SavePersonInfo(self, filename, content):
        """Write ``content`` to ``./<filename>.json``, one JSON object per line.

        Args:
            filename: Base name of the output file (``.json`` is appended).
            content: Iterable of JSON-serialisable items.
        """
        with codecs.open("./%s.json" % filename, "w+", "utf-8") as f:
            for item in content:
                f.write(json.dumps(item) + "\n")

    def run(self, url):
        """Run one interactive search-pick-save cycle.

        Args:
            url: Address of the user-search result page.

        Returns:
            ``-1`` when the search page yields no users, otherwise ``None``.
        """
        userList = self.GetPageSource(
            url, (By.ID, "pl_user_feedList"), self.GetUserList)
        if not userList:
            return -1
        for index, user in enumerate(userList, 1):
            print ("%d:\t%s\n\t%s\n\t%s\n\t%s\n\t%s\n\t%s\n" % (
                index, user["nickName"], user["mainPage"], user["Address"],
                user["Card"], user["Num"], user["PersonInfo"]))

        total = len(userList)
        while True:
            try:
                choice = int(raw_input("请在上面输出的内容中选择需要的选项 1-%d: " % total))
            except ValueError:
                # Only a non-numeric answer is retried; Ctrl-C and EOF now
                # propagate instead of trapping the operator in the loop.
                choice = None
            if choice is not None and 0 < choice <= total:
                break
            print("请输入数字的范围 1 - %d " % total)
        selected = userList[choice - 1]

        # Open the chosen profile in a fresh window and switch to it.
        self.browser.execute_script("window.open()")
        self.browser.switch_to.window(self.browser.window_handles[-1])
        userInfo = self.GetPageSource(
            selected["mainPage"],
            (By.CSS_SELECTOR, "div[node-type='feed_list']"),
            self.GetPersonPageContent)
        if userInfo:
            self.SavePersonInfo(selected["nickName"], userInfo)

    def __del__(self):
        """Shut the browser session down when the object is collected."""
        # ``browser`` is missing when webdriver.Firefox() failed in __init__.
        browser = getattr(self, "browser", None)
        if browser is None:
            return
        try:
            # quit() closes every window and also ends the driver process.
            browser.quit()
        except Exception:
            # Nothing useful can be done about a failed shutdown inside a
            # finalizer (the session may already be gone).
            pass
def main():
    """Ask for a name on the console and scrape the matching Weibo user."""
    keyword = raw_input("请输入需要搜索的名字 : ")
    # The keyword is percent-encoded twice before being placed in the path;
    # presumably the search endpoint expects double encoding -- TODO confirm.
    encoded = quote(quote(keyword))
    searchUrl = "http://s.weibo.com/user/%s&Refer=index" % encoded
    scraper = Weibo()
    scraper.run(searchUrl)
# Run the interactive scraper only when executed as a script, not on import.
if "__main__" == __name__:
    main()