Python爬取贴吧用户关系网

精灵与法师 发表于 2020-8-25 17:03

看了论坛许多教程后，自己写的第一个爬虫。
请大家多多指教！

爬虫:
1.输入一个用户页面。
2.获取用户页面中的关注，粉丝信息。
3.继续获取关注粉丝的用户页。
4.对新获取的用户重复步骤2、3，直到达到最大迭代层数。
5.用dill保存爬取结果。

备注:需要在运行目录下准备百度账户的cookies文件（文件名为cookies.txt，内容为以分号分隔的 name=value 形式）。

import requests
import re
import json
import time
from lxml import etree
import dill

# Module-level state shared by the crawler functions
# Scheme and host prepended to the relative hrefs scraped from pages.
tieba_prefix = "http://tieba.baidu.com"
# Users crawled so far; membership is tested by user id before fetching.
userdict = dict()

# Crawl settings holder
class para:
    """Namespace for crawl-wide settings.

    Every attribute starts as ``None`` and is assigned on the class itself
    (not on an instance) by the crawler's ``main`` before any page is fetched.
    """

    headers = None   # HTTP headers passed to requests.get
    cookies = None   # cookies passed to requests.get
    max_loop = None  # upper bound compared against loop_info['Node']
    max_page = None  # upper bound compared against loop_info['Page']
    max_num = None   # upper bound compared against loop_info['Num']

# User record
class userinfo(object):
    """Profile data and follow relations collected for one user page."""

    def __init__(self, url):
        """Create an empty record for the user whose page is *url*."""
        self.url = url
        self.id = None
        self.username = None
        self.age = None
        self.tie = None
        self.sex = None
        self.concern_num = None
        self.concern_url = None
        # Per-instance lists of userinfo objects this user follows / is followed by.
        self.concern_list = []
        self.fans_num = None
        self.fans_url = None
        self.fans_list = []

    # Save to file
    def saveToFile(self, filename="userinfo.jsonl"):
        """Append this user's scalar fields to *filename* as one JSON line.

        The original method built the dictionary below and then discarded it,
        so nothing was ever saved. The relation lists are left out on purpose:
        they hold other ``userinfo`` objects, which JSON cannot encode.

        Args:
            filename: Path of the file to append to; created if missing.
        """
        dictObj = {
            "url": self.url,
            "id": self.id,
            "username": self.username,
            "age": self.age,
            "tie": self.tie,
            "sex": self.sex,
            "concern_num": self.concern_num,
            "concern_url": self.concern_url,
            "fans_num": self.fans_num,
            "fans_url": self.fans_url,
        }
        # ensure_ascii=False keeps non-ASCII usernames readable in the file.
        with open(filename, "a", encoding="utf-8") as fh:
            fh.write(json.dumps(dictObj, ensure_ascii=False) + "\n")

# Fetch a URL and parse the response
def _report_error_page(html):
    """Print the message carried by an error page, or a fallback notice."""
    message = html.xpath('//div[@id="errorText"]/p/text()')
    if message:
        print(message)
    else:
        print('获取错误消息失败')


def getHtmlFromUrl(url, loop_info):
    """Download *url* and return it as a parsed lxml tree.

    The request is sent with the headers and cookies stored on ``para``.

    Args:
        url: Address of the page to fetch.
        loop_info: Progress dictionary; only printed here, never modified.

    Returns:
        The ``etree.HTML`` tree of the page, or ``None`` when the request
        fails, the status code is not 200, or the final response URL is one of
        the two error pages checked below.
    """
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(
            url, headers=para.headers, cookies=para.cookies, timeout=30)
    except requests.RequestException as err:
        print('当前页面:' + url)
        print(loop_info)
        print('页面获取失败')
        print(err)
        return None

    print('当前页面:' + url)
    print(loop_info)

    if response.status_code != 200:
        print('页面获取失败')
        print(response.status_code)
        print(response.history)
        return None

    # Error page: "the page you requested does not exist" (decoded as GBK).
    if response.url == 'http://static.tieba.baidu.com/tb/error.html?ErrType=1':
        data = response.content.decode('gbk', errors='replace')
        _report_error_page(etree.HTML(data))
        return None

    # errors='replace' keeps one malformed byte from aborting the whole crawl.
    data = response.content.decode('utf-8', errors='replace')
    html = etree.HTML(data)

    # Error page: "the user you requested has been blocked".
    if response.url == 'http://tieba.baidu.com/tb/static-ucenter/html/error.html':
        _report_error_page(html)
        return None

    # Normal result
    return html

# Extract profile information from a user's page
def _first_or_none(items):
    """Return the first element of *items*, or ``None`` when it is empty."""
    return items[0] if items else None


def _labelled_number(texts, labels):
    """Return the number found in the first text containing one of *labels*.

    A '万' directly after the digits multiplies the value by 10000.
    Returns ``None`` when no text carries both a label and a number.
    """
    for text in texts:
        if not any(label in text for label in labels):
            continue
        found = re.search(r'(\d+(?:\.\d+)?)\s*(万?)', text)
        if found:
            value = float(found.group(1))
            return value * 10000 if found.group(2) else value
    return None


def _follow_summary(html, wrap_id):
    """Return ``(count_text, absolute_url)`` for one follow box.

    *wrap_id* is the ``id`` of the ``<ul>`` that identifies the box.
    Returns ``(None, None)`` when the page does not contain that box.
    """
    anchor = '//ul[@id="%s"]/..//span[@class="concern_num"]/a' % wrap_id
    counts = html.xpath(anchor + '/text()')
    if not counts:
        return None, None
    links = html.xpath(anchor + '/@href')
    return counts[0], (tieba_prefix + links[0] if links else None)


def get_concern_info(html, user, id, loop_info):
    """Fill *user* from its parsed page, register it, then crawl its relations.

    XPath queries return lists. The original passed those lists straight to
    ``re.search``, ``float``, ``int`` and string concatenation, which raises
    ``TypeError``; the first match is now taken explicitly.

    Args:
        html: Parsed page of the user (result of ``getHtmlFromUrl``).
        user: ``userinfo`` instance to populate.
        id: Known user id, or ``''`` to read it from the page.
        loop_info: Progress dictionary with keys ``Node``, ``Page``, ``Num``
            and ``Origin``. It is not modified; nested crawls get a copy.

    Returns:
        ``None``. Returns early, without registering the user, when *id* is
        ``''`` and no id can be read from the page.
    """
    # Resolve the id from the page when the caller does not know it.
    if id == '':
        hrefs = html.xpath('//a[@class="nav_icon nav_main"]/@href')
        matchObj = re.search(r'.*?id=(tb.*)', hrefs[0]) if hrefs else None
        if matchObj:
            id = matchObj.group(1)
        else:
            print("id No match!!")
            return

    # Username.
    # NOTE(review): as written this XPath selects the first <span> text of the
    # whole page; a predicate may have been lost - confirm the selector.
    username = _first_or_none(html.xpath('//span/text()'))

    # Account age and post count come from the same group of spans.
    # NOTE(review): the labels are assumed from the original comments
    # (吧龄 / 发帖数); confirm them against the live page markup.
    userdata = html.xpath('//div[@class="userinfo_userdata"]/span/text()')
    age = _labelled_number(userdata, ('吧龄',))
    posts = _labelled_number(userdata, ('发贴', '发帖'))
    # round() rather than int(): e.g. 2.3 * 10000 is not exactly 23000.0.
    tie = None if posts is None else int(round(posts))

    # Sex is carried by a CSS class name; the first class attribute is used.
    sex = _first_or_none(
        html.xpath('//div[@class="userinfo_userdata"]/span/@class'))

    # Follow / fans counters and their listing pages; None when a box is absent.
    concern_num, concern_url = _follow_summary(html, 'concern_wrap_concern')
    fans_num, fans_url = _follow_summary(html, 'concern_wrap_fans')

    # Complete the user record.
    user.id = id
    user.username = username
    user.age = age
    user.tie = tie
    user.sex = sex
    if concern_num is not None:
        user.concern_num = concern_num
        user.concern_url = concern_url
    if fans_num is not None:
        user.fans_num = fans_num
        user.fans_url = fans_url

    # Register the user so it is not crawled again. The original rebound a
    # local name (userdict = user) and therefore never stored anything.
    userdict[id] = user
    print('加入用户:' + str(username))

    # Depth limit check.
    depth = loop_info['Node']
    if depth <= para.max_loop:
        # Each listing crawl gets its own copy of the progress dictionary, so
        # nested crawls cannot overwrite the caller's Origin/Page/Num and the
        # depth counter unwinds when the recursion returns.
        if concern_url is not None:
            loop_concern(
                concern_url,
                dict(loop_info, Node=depth + 1,
                     Origin=str(username) + " 关注页"),
                user)
        if fans_url is not None:
            loop_concern(
                fans_url,
                dict(loop_info, Node=depth + 1,
                     Origin=str(username) + " 粉丝页"),
                user)

# Walk the pages of a follow/fans listing (Baidu shows at most 500 entries)
def loop_concern(url, loop_info, user):
    """Process a follow/fans listing page by page.

    Stops when a page cannot be fetched, when ``para.max_page`` pages have
    been processed, or when the page has no "next" link.

    Args:
        url: Address of the first listing page.
        loop_info: Progress dictionary; ``Page`` is updated for every page.
        user: ``userinfo`` whose relation lists are filled by ``get_concern``.
    """
    # get_concern can trigger nested crawls that write to this same dictionary,
    # so the page number is kept locally and Page/Origin are re-asserted each
    # time round instead of being read back from loop_info.
    origin = loop_info['Origin']
    page = 1

    while True:
        loop_info['Page'] = page
        loop_info['Origin'] = origin

        html = getHtmlFromUrl(url, loop_info)
        # Stop when the current page could not be fetched.
        if html is None:
            break
        get_concern(html, loop_info, user)

        # Stop at the page limit.
        if page >= para.max_page:
            break
        page += 1

        # URL of the next page; xpath returns a list, so take its first item
        # (the original concatenated the list itself, raising TypeError).
        next_links = html.xpath(
            '//div[@class="pager pager-center"]/a[@class="next"]/@href')
        if not next_links:
            # Last page reached.
            break
        url = tieba_prefix + next_links[0]

# Extract the users shown on one follow/fans listing page
def get_concern(html, loop_info, user):
    """Crawl the users on one listing page and attach them to *user*.

    At most ``para.max_num`` entries of the page are considered. Users not yet
    in ``userdict`` are fetched and crawled first. Every entry that ends up in
    ``userdict`` is appended to ``user.concern_list`` or ``user.fans_list``,
    chosen by the last three characters of ``loop_info['Origin']``.

    Args:
        html: Parsed listing page.
        loop_info: Progress dictionary; ``Num`` is updated per entry.
        user: ``userinfo`` owning the listing.
    """
    # Decide the destination list up front: the nested crawl below may rewrite
    # loop_info['Origin'], which made the original file users under the wrong
    # relation.
    origin = loop_info['Origin']
    if origin[-3:] == '关注页':
        target = user.concern_list
    elif origin[-3:] == '粉丝页':
        target = user.fans_list
    else:
        target = None

    pageIdList = html.xpath('//div[@class="user"]/@portrait')
    pageUrlList = html.xpath('//span[@class="name"]/a/@href')

    # Counted locally for the same reason; loop_info['Num'] is display only.
    taken = 0
    loop_info['Num'] = taken

    # The two lists are assumed to be parallel; zip stops at the shorter one.
    for u_id, href in zip(pageIdList, pageUrlList):
        # Stop at the per-page limit.
        if taken >= para.max_num:
            break
        taken += 1
        loop_info['Num'] = taken

        # Users already crawled are not fetched again.
        if u_id not in userdict:
            u_url = tieba_prefix + href
            u_html = getHtmlFromUrl(u_url, loop_info)

            # Skip users whose page could not be fetched.
            if u_html is None:
                continue

            # Build and crawl the sub-user's record.
            sub_user = userinfo(u_url)
            get_concern_info(u_html, sub_user, u_id, loop_info)

            # Undo whatever the nested crawl wrote into the shared dictionary.
            loop_info['Origin'] = origin
            loop_info['Num'] = taken

        # Append the user object itself (the original appended the whole
        # userdict). Entries that could not be registered are skipped.
        known = userdict.get(u_id)
        if known is not None and target is not None:
            target.append(known)

def main(max_loop, max_page, max_num, origin_url):
    """Configure the crawler and crawl outward from *origin_url*.

    Cookies are read from ``cookies.txt`` in the current working directory;
    the file is split on ``;`` and each piece on its first ``=``.

    Args:
        max_loop: Limit compared against ``loop_info['Node']``.
        max_page: Maximum number of listing pages read per listing.
        max_num: Maximum number of entries taken from one listing page
            (per the original comment, a page holds at most 20).
        origin_url: Page of the first user.

    Returns:
        ``userdict`` after the crawl, or ``None`` when the first page cannot
        be fetched.
    """
    para.max_loop = max_loop
    para.max_page = max_page
    para.max_num = max_num

    # Request headers
    para.headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
    }

    # Read the saved cookie string; `with` closes the file afterwards.
    with open(r'cookies.txt', 'r', encoding='utf-8') as f:
        raw_cookies = f.read()

    para.cookies = {}
    for item in raw_cookies.split(';'):
        # Split on the first '=' only: cookie values may contain '=' themselves.
        name, sep, value = item.strip().partition('=')
        if not sep or not name:
            # Blank piece (e.g. after a trailing ';'); the original crashed here.
            continue
        # Store under the cookie name; the original replaced the whole dict
        # with the last value.
        para.cookies[name] = value

    # Initial progress information
    loop_info = {'Node': 0, 'Page': 0, 'Num': 0, 'Origin': ''}

    # Record for the first user
    user = userinfo(origin_url)

    html = getHtmlFromUrl(origin_url, loop_info)
    if html is None:
        print("原始输入错误")
        return None

    # The id is unknown for the first user, so it is read from the page.
    get_concern_info(html, user, '', loop_info)

    return userdict

if __name__ == '__main__':

    # Placeholder: replace with the URL of a real user page before running.
    origin_url = '贴吧用户url'
    main(2, 10, 2, origin_url)

    # Save the result. dump_session pickles the state of the __main__ module,
    # which includes userdict.
    filename = r"crawler_data.pkl"
    dill.dump_session(filename)

    print("完成!!")

绘图部分:
1.获取爬取结果。
2.networkx建立有向图。
3.边的权重 = 被关注者的粉丝数 + 关注者的关注数。

# 作图
import matplotlib.pyplot as plt
import networkx as nx
import tieba_shejiao
import time
import dill # 加载爬完的数据

# Ids of users whose traversal has started; appended to by user_Iteration.
user_in_loop = list()

def calc_weight(user1, user2):
    """Return user1's fan count plus user2's follow count.

    Each count is converted with ``int``. A count that is missing or otherwise
    falsy (``None``, ``''``, ``0``) contributes 0.
    """
    fans_num = 0
    concern_num = 0

    if user1.fans_num:
        fans_num = int(user1.fans_num)
    if user2.concern_num:
        concern_num = int(user2.concern_num)
    return concern_num + fans_num

def _link_follow(G, follower, followee):
    """Record in *G* that *follower* follows *followee*.

    Nodes are usernames. A one-way follow is a blue edge from follower to
    followee. When the opposite edge already exists the follow is mutual, and
    both directions are (re)added in red.
    """
    weight = calc_weight(followee, follower)
    if G.has_edge(followee.username, follower.username):
        # Drop the earlier blue edge before adding the red pair.
        G.remove_edge(followee.username, follower.username)
        G.add_edge(follower.username, followee.username,
                   color='r', weight=weight)
        G.add_edge(followee.username, follower.username,
                   color='r', weight=weight)
    else:
        G.add_edge(follower.username, followee.username,
                   color='b', weight=weight)


def user_Iteration(G, user):
    """Add *user*'s follow relations to *G*, then recurse into related users.

    The traversal consumes ``user.fans_list`` and ``user.concern_list``: each
    entry is removed once its edge has been added. Users whose id is already
    in ``user_in_loop`` are not traversed again.

    Args:
        G: Directed graph being built.
        user: ``userinfo`` to process.
    """
    # Mark the user as being traversed.
    user_in_loop.append(user.id)

    # Fans: each fan follows `user`. The slice is a reversed copy, so removing
    # from the live list inside the loop is safe.
    for sub_user in user.fans_list[::-1]:
        _link_follow(G, sub_user, user)
        user.fans_list.remove(sub_user)

        # Users being traversed or already traversed are not entered again.
        if sub_user.id not in user_in_loop:
            user_Iteration(G, sub_user)

    # Follows: `user` follows each of them.
    for sub_user in user.concern_list[::-1]:
        _link_follow(G, user, sub_user)
        user.concern_list.remove(sub_user)

        if sub_user.id not in user_in_loop:
            user_Iteration(G, sub_user)

def main():
    """Load the crawl result, build the follow graph, export it and draw it."""
    # Load the data.
    # NOTE: load_session unpickles the file; only load files you created.
    filename = r"crawler_data.pkl"
    dill.load_session(filename)

    # NOTE(review): userdict is not defined in this script; it is presumably
    # restored into the running module by load_session - confirm.
    G = nx.DiGraph()
    for user in userdict.values():
        user_Iteration(G, user)

    # One colour per edge, in the order nx.draw iterates the edges. The edge
    # weights are not drawn; they remain edge attributes and are written to
    # the GEXF file below.
    edges = G.edges()
    colors = [G[u][v]['color'] for u, v in edges]

    # Draw
    nx.draw(
        G,
        pos=nx.spring_layout(G, iterations=10),
        node_color='y',
        edge_color=colors,
        with_labels=True,
        font_size=10,
        node_size=20)

    # Export with a timestamped file name so earlier exports are not overwritten.
    nx.write_gexf(G, "network" + time.strftime("%Y%m%d%H%M%S", time.localtime()) + ".gexf")

    plt.show()

# Run the plotting step only when executed as a script.
if __name__ == '__main__':
    main()

结果演示:

画图还是很难，见谅

温故而知新，与大家共勉。

窥屏专用小马甲 发表于 2020-8-26 16:20

好强啊，马克一下。想转型从sql工具人到做用户挖掘，我才练到没有限制的小说网站爬虫，任重道远啊

李欣发表于 2020-8-25 19:56

感谢分享

spiraea 发表于 2020-8-25 20:42

感谢分享，启发一下思路

w2010d 发表于 2020-9-19 13:55

报错是为什么呢...

页: [1]

吾爱破解 - 52pojie.cn's Archiver

Python爬取贴吧用户关系网