精灵与法师 · Posted on 2020-8-25 17:03
After reading many tutorials on this forum, this is the first crawler I have written myself.
Feedback and pointers are very welcome!
Crawler:
1. Takes a user profile page as input.
2. Extracts the following and fans information from that page.
3. Fetches the user pages of those followings and fans.
4. Repeats from step 1 until the maximum recursion depth is reached.
5. Saves the crawl results with dill.
Note: a cookies file from a logged-in Baidu account is required (see the format sketch below).
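The cookies file is assumed to be a single "name=value; name=value; ..." string copied from the browser's request headers; main() below parses it into a dict. A minimal sketch of that format and the parse (the key names BDUSS and STOKEN are only illustrative):
[Python]
# cookies.txt (hypothetical values): BDUSS=abc123; STOKEN=def456
cookies = {}
with open('cookies.txt', 'r') as f:
    for pair in f.read().split(';'):
        name, value = pair.strip().split('=', 1)  # split on the first '=' only
        cookies[name] = value
print(cookies)  # {'BDUSS': 'abc123', 'STOKEN': 'def456'}
The full crawler follows.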
[Python]
import requests
import re
import json
import time
from lxml import etree
import dill

# Global variables
tieba_prefix = "http://tieba.baidu.com"
userdict = {}


# Parameter holder
class para:
    headers = None
    cookies = None
    max_loop = None
    max_page = None
    max_num = None
# User info class
class userinfo(object):
    def __init__(self, url):
        self.url = url
        self.id = None
        self.username = None
        self.age = None
        self.tie = None
        self.sex = None
        self.concern_num = None
        self.concern_url = None
        self.concern_list = []
        self.fans_num = None
        self.fans_url = None
        self.fans_list = []

    # Save this user to a file as one JSON line
    def saveToFile(self):
        dictObj = {
            "url": self.url,
            "id": self.id,
            "username": self.username,
            "age": self.age,
            "tie": self.tie,
            "sex": self.sex,
            "concern_num": self.concern_num,
            "concern_url": self.concern_url,
            "fans_num": self.fans_num,
            "fans_url": self.fans_url
        }
        # Append as one JSON line (users.json is an assumed filename)
        with open('users.json', 'a', encoding='utf-8') as f:
            f.write(json.dumps(dictObj, ensure_ascii=False) + '\n')
# Fetch a URL and parse it into an lxml HTML tree
def getHtmlFromUrl(url, loop_info):
    response = requests.get(url, headers=para.headers, cookies=para.cookies)
    print('Current page: ' + url)
    print(loop_info)
    if response.status_code == 200:
        # "Sorry, the page you want to visit does not exist."
        if response.url == 'http://static.tieba.baidu.com/tb/error.html?ErrType=1':
            data = response.content.decode('gbk')  # this error page is gbk-encoded
            html = etree.HTML(data)
            result = html.xpath('//div[@id="errorText"]/p/text()')
            if len(result) > 0:
                print(result[0])
            else:
                print('Failed to extract the error message')
            return
        data = response.content.decode('utf-8')
        html = etree.HTML(data)
        # "Sorry, the user you want to visit has been blocked."
        if response.url == 'http://tieba.baidu.com/tb/static-ucenter/html/error.html':
            result = html.xpath('//div[@id="errorText"]/p/text()')
            if len(result) > 0:
                print(result[0])
            else:
                print('Failed to extract the error message')
            return
        # Normal result
        return html
    else:
        print('Failed to fetch the page')
        print(response.status_code)
        print(response.history)
# Extract profile information from a user page
def get_concern_info(html, user, id, loop_info):
    # Optional fields default to None so the checks below never hit unbound names
    concern_num = concern_url = fans_num = fans_url = None
    # Resolve the user id if it was not supplied
    if id == '':
        result = html.xpath('//a[@class="nav_icon nav_main"]/@href')[0]
        matchObj = re.search(r'.*?id=(tb.*)', result)
        if matchObj:
            id = matchObj.group(1)
        else:
            print("id No match!!")
            return
    # Username
    username = html.xpath(
        '//span[starts-with(@class,"userinfo_username ")]/text()')[0]
    # Tieba age in years
    result = html.xpath(
        '//div[@class="userinfo_userdata"]/span[2]/text()')[0][3:-1]
    age = float(result)
    # Post count
    result = html.xpath(
        '//div[@class="userinfo_userdata"]/span[4]/text()')[0][3:]
    # Counts above ten thousand are shown as a decimal suffixed with '万'
    if result[-1] == '万':
        tie = int(float(result[0:-1]) * 10000)
    else:
        tie = int(result)
    # Gender (tail of the span's class attribute)
    sex = html.xpath(
        '//div[@class="userinfo_userdata"]/span[1]/@class')[0][26:]
    # Following count
    result = html.xpath(
        '//ul[@id="concern_wrap_concern"]/..//span[@class="concern_num"]/a/text()'
    )
    if len(result) > 0:
        concern_num = result[0]
        # Following page
        result = html.xpath(
            '//ul[@id="concern_wrap_concern"]/..//span[@class="concern_num"]/a/@href'
        )
        concern_url = tieba_prefix + result[0]
    # Fan count
    result = html.xpath(
        '//ul[@id="concern_wrap_fans"]/..//span[@class="concern_num"]/a/text()'
    )
    if len(result) > 0:
        fans_num = result[0]
        # Fans page
        result = html.xpath(
            '//ul[@id="concern_wrap_fans"]/..//span[@class="concern_num"]/a/@href'
        )
        fans_url = tieba_prefix + result[0]
    # Fill in the user object
    user.id = id
    user.username = username
    user.age = age
    user.tie = tie
    user.sex = sex
    user.concern_num = concern_num
    user.concern_url = concern_url
    user.fans_num = fans_num
    user.fans_url = fans_url
    # Real-time export of user info
    #user.saveToFile()
    # Register the user as crawled
    userdict[id] = user
    print('Added user: ' + username)
    # Recurse while below the depth limit
    if loop_info['Node'] <= para.max_loop:
        # Advance the loop info
        loop_info['Node'] = loop_info['Node'] + 1
        # Iterate over the following pages ('关注页' tags a following page)
        if concern_url is not None:
            loop_info['Origin'] = username + " 关注页"
            loop_concern(concern_url, loop_info, user)
        # Iterate over the fans pages ('粉丝页' tags a fans page)
        if fans_url is not None:
            loop_info['Origin'] = username + " 粉丝页"
            loop_concern(fans_url, loop_info, user)
# Page through a following/fans list (Baidu shows at most 500 entries)
def loop_concern(url, loop_info, user):
    # Initialize
    loop_info['Page'] = 1
    while True:
        # Fetch and parse the page
        html = getHtmlFromUrl(url, loop_info)
        # Stop if the current page could not be fetched
        if html is None:
            break
        get_concern(html, loop_info, user)
        # Stop at the page limit
        if loop_info['Page'] >= para.max_page:
            break
        # Advance the loop info
        loop_info['Page'] = loop_info['Page'] + 1
        # URL of the next page
        result = html.xpath(
            '//div[@class="pager pager-center"]/a[@class="next"]/@href')
        if len(result) > 0:
            url = tieba_prefix + result[0]
        else:
            # Stop on the last page
            break
# Extract users from a following/fans list page
def get_concern(html, loop_info, user):
    # Initialize
    loop_info['Num'] = 0
    pageIdList = html.xpath('//div[@class="user"]/@portrait')
    pageUrlList = html.xpath('//span[@class="name"]/a/@href')
    # pageUrlList is assumed to have the same length as pageIdList
    for i in range(len(pageIdList)):
        u_id = pageIdList[i]
        u_url = tieba_prefix + pageUrlList[i]
        # Stop at the per-page limit
        if loop_info['Num'] >= para.max_num:
            break
        # Advance the loop info
        loop_info['Num'] = loop_info['Num'] + 1
        # Users we already have are not crawled again
        if u_id not in userdict.keys():
            u_html = getHtmlFromUrl(u_url, loop_info)
            # Skip this user if their page could not be fetched
            if u_html is None:
                continue
            # Build the sub-user's info (this also registers them in userdict)
            sub_user = userinfo(u_url)
            get_concern_info(u_html, sub_user, u_id, loop_info)
        # Append to the parent's following/fans list
        if loop_info['Origin'][-3:] == '关注页':
            user.concern_list.append(userdict[u_id])
        elif loop_info['Origin'][-3:] == '粉丝页':
            user.fans_list.append(userdict[u_id])
def main(max_loop, max_page, max_num, origin_url):
    # Maximum recursion depth
    para.max_loop = max_loop
    # Maximum number of list pages per user
    para.max_page = max_page
    # Maximum users taken per page (a page shows at most 20)
    para.max_num = max_num
    # Request headers
    para.headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
    }
    # Read the saved cookies and build the cookie dict
    para.cookies = {}
    with open(r'cookies.txt', 'r') as f:
        for line in f.read().split(';'):
            # maxsplit=1 splits each pair into exactly two parts
            name, value = line.strip().split('=', 1)
            para.cookies[name] = value
    # Initialize the loop info
    loop_info = {'Node': 0, 'Page': 0, 'Num': 0, 'Origin': ''}
    # Build the root user
    user = userinfo(origin_url)
    # Fetch the first user page
    html = getHtmlFromUrl(origin_url, loop_info)
    if html is None:
        print("Invalid origin URL")
        return
    # Extract the root user's info (recursion starts from here)
    get_concern_info(html, user, '', loop_info)
    return userdict
if __name__ == '__main__':
    origin_url = 'Tieba user profile URL'  # replace with a real user page URL
    main(2, 10, 2, origin_url)
    # Save the whole session (including userdict) to disk
    filename = r"crawler_data.pkl"
    dill.dump_session(filename)
    print("Done!!")
Plotting:
1. Load the crawl results.
2. Build a directed graph with networkx.
3. Use fan and following counts as edge weights (a minimal illustration of such weighted, colored edges follows).
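For reference, this is how the directed, attributed edges used below behave in networkx; the node names and weight here are made up:
[Python]
import networkx as nx

G = nx.DiGraph()
# userA follows userB: one blue edge carrying a weight attribute
G.add_edge('userA', 'userB', color='b', weight=120)
# userB follows back: in the script below, both directions get recolored red
G.add_edge('userB', 'userA', color='r', weight=120)
print(G['userA']['userB'])  # {'color': 'b', 'weight': 120}
The full plotting script follows.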
[Python]
# Plotting
import matplotlib.pyplot as plt
import networkx as nx
import tieba_shejiao  # the crawler module; imported so dill can restore its userinfo objects
import time
import dill  # loads the crawled data

user_in_loop = []


def calc_weight(user1, user2):
    """Return user1's fan count plus user2's following count."""
    fans_num = 0
    concern_num = 0
    if user1.fans_num:
        fans_num = int(user1.fans_num)
    if user2.concern_num:
        concern_num = int(user2.concern_num)
    return concern_num + fans_num
def user_Iteration(G, user):
    # Mark this user as visited
    user_in_loop.append(user.id)
    # Walk the fans list
    if len(user.fans_list) > 0:
        for sub_user in user.fans_list[::-1]:
            # Mutual follow?
            if G.has_edge(user.username, sub_user.username):
                G.remove_edge(user.username, sub_user.username)  # remove the original blue edge
                G.add_edge(sub_user.username,
                           user.username,
                           color='r',
                           weight=calc_weight(user, sub_user))  # red marks a mutual follow
                G.add_edge(user.username,
                           sub_user.username,
                           color='r',
                           weight=calc_weight(user, sub_user))  # red marks a mutual follow
            else:
                G.add_edge(sub_user.username,
                           user.username,
                           color='b',
                           weight=calc_weight(user, sub_user))  # one-way follow
            user.fans_list.remove(sub_user)
            # Users being / already processed are not revisited
            if sub_user.id not in user_in_loop:
                user_Iteration(G, sub_user)
    # Walk the following list
    if len(user.concern_list) > 0:
        for sub_user in user.concern_list[::-1]:
            # Mutual follow?
            if G.has_edge(sub_user.username, user.username):
                G.remove_edge(sub_user.username, user.username)  # remove the original blue edge
                G.add_edge(user.username,
                           sub_user.username,
                           color='r',
                           weight=calc_weight(sub_user, user))  # red marks a mutual follow
                G.add_edge(sub_user.username,
                           user.username,
                           color='r',
                           weight=calc_weight(sub_user, user))  # red marks a mutual follow
            else:
                G.add_edge(user.username,
                           sub_user.username,
                           color='b',
                           weight=calc_weight(sub_user, user))  # one-way follow
            user.concern_list.remove(sub_user)
            # Users being / already processed are not revisited
            if sub_user.id not in user_in_loop:
                user_Iteration(G, sub_user)
def main():
    # Load the crawled session
    filename = r"crawler_data.pkl"
    dill.load_session(filename)
    G = nx.DiGraph()
    for user in userdict.values():
        user_Iteration(G, user)
    # Collect the edge attributes
    edges = G.edges()
    colors = [G[u][v]['color'] for u, v in edges]
    weights = [G[u][v]['weight'] for u, v in edges]
    # Draw the graph
    nx.draw(
        G,
        pos=nx.spring_layout(G, iterations=10),
        node_color='y',
        edge_color=colors,
        with_labels=True,
        font_size=10,
        node_size=20)
    # Export to GEXF, which keeps the color/weight attributes
    nx.write_gexf(G, "network" + time.strftime("%Y%m%d%H%M%S", time.localtime()) + ".gexf")
    plt.show()


if __name__ == '__main__':
    main()
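The exported .gexf file can be opened in Gephi or reloaded with networkx for further analysis. A sketch, assuming a file produced by the timestamped write_gexf call above:
[Python]
import networkx as nx

# The filename below is hypothetical; use whatever write_gexf produced
G = nx.read_gexf("network20200825170300.gexf")
print(G.number_of_nodes(), G.number_of_edges())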
Results demo:
Plotting is still hard for me, so please bear with the rough figure.
Reviewing the old to learn the new; I hope this helps everyone.