Things you will need: PyMySQL: https://github.com/PyMySQL/PyMySQL
and peewee: https://github.com/coleifer/peewee
You need to install both of these yourself.
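Both can normally be installed with pip, e.g. pip install PyMySQL peewee (plus requests and parsel, which the spider code below also uses).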
My environment is MySQL + Navicat + PyCharm.
There are a few small problems in the code that I haven't managed to solve, but it should still be usable as a reference.
There is also room for optimization, for example around the threading and the code structure. I hope people can offer some suggestions and point out things I didn't know about or didn't think of.
First the database needs to be set up. The code is as follows.
(Unfortunately I wasn't able to get the Answer data: for some reason the link I requested never returned any JSON to me.)
[Python]
from peewee import *

db = MySQLDatabase('spider', host='localhost', port=3306, user="root", password="sa")


class BaseModel(Model):
    class Meta:
        database = db


'''
CharField needs a max_length;
use TextField when the length cannot be guaranteed.
'''


class Topic(BaseModel):
    title = CharField(max_length=100)
    content = TextField(default="")
    id = IntegerField(primary_key=True)
    author = CharField(max_length=100)
    # creat_time = DateTimeField()
    answer_nums = IntegerField(default=0)
    click_nums = IntegerField(default=0)
    like_nums = IntegerField(default=0)


class Answer(BaseModel):
    content = TextField(default="")
    topics_id = IntegerField()
    author = CharField(max_length=100)


class Author(BaseModel):
    name = CharField(max_length=100)
    id = CharField(primary_key=True, max_length=100)
    clickIn_nums = IntegerField(default=0)
    original_num = IntegerField(default=0)
    rate = IntegerField(default=-1)
    # comments_nums = IntegerField(default=0)
    like_nums = IntegerField(default=0)
    # introduce = TextField(null=True)
    follower_nums = IntegerField(default=0)


if __name__ == '__main__':
    db.create_tables([Topic, Answer, Author])
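Note that db.create_tables() only creates the tables; the spider database itself has to exist in MySQL first (create it in Navicat, or run CREATE DATABASE spider; once).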
Now for the main code:
[Python]
import ast
import re
import time
from threading import Thread
from urllib import parse

import requests
from parsel import Selector

# *****************************************************
# import the peewee models defined above; "models" here is just a placeholder,
# change it to whatever filename you saved the model code as
from models import *
# *****************************************************

domain = 'https://bbs.csdn.net/'
# urls of the topic list pages
topic_list_urls = []
# topic_list = []
# list of author urls, i.e. one per user
author_list = []
# urls extracted from the left menu
url_list = []
# the urls we actually want in the end
last_url = []
def get_nodes_json():
    # fetch the left-menu js and pull the json out of it
    left_menu_text = requests.get("https://bbs.csdn.net/dynamic_js/left_menu.js?csdn").text
    node_str_match = re.search(r"forumNodes: (.*])", left_menu_text)
    if node_str_match:
        # replace the js null with python's None
        node_str = node_str_match.group(1).replace("null", "None")
        # evaluate the literal to get a list of all the nodes
        node_list = ast.literal_eval(node_str)
        return node_list
    return []


def process_nodes_list(node_list):
    for item in node_list:
        if "url" in item:
            if item['url']:
                url_list.append(item['url'])
            if "children" in item:
                # recurse into the child nodes
                process_nodes_list(item['children'])


def get_level1_list(node_list):
    # you can print things here or step through with the debugger
    level1_url = []
    for item in node_list:
        if "url" in item and item['url']:
            level1_url.append(item['url'])
    return level1_url


def get_last_urls():
    # ***this is the driver that calls the functions above***
    node_list = get_nodes_json()
    process_nodes_list(node_list)
    # get the top-level urls so they can be filtered out below
    level1_url = get_level1_list(node_list)
    for url in url_list:
        if url not in level1_url:
            # join with the domain to get a full url
            last_url.append(parse.urljoin(domain, url))
    return last_url
# thread that fetches each user's profile info
class ParseAuthorThread(Thread):
    def run(self):
        # pop() raises when the list is empty, so wrap it in a try
        while 1:
            try:
                url = author_list.pop()
            except IndexError as e:
                time.sleep(1)
                continue
            print("Fetching user: {}".format(url))
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0'
            }
            resp = requests.get(url, headers=headers).text
            sel = Selector(text=resp)
            # print(url)
            # will be filled in and saved below
            author = Author()
            # CSDN actually serves two different profile layouts; they really should be handled
            # separately, which I haven't done. Print the url, compare the html of a failing
            # url with a working one, and adjust the logic below accordingly.
            # print(url)
            if '<div class="user-profile-head-name"' in resp:
                name = sel.xpath("//div[@class='user-profile-head-name']/div/text()").extract()[0]
            else:
                name = sel.xpath("//span[@class='name ']/text()").extract()[0]
            # name = sel.xpath("//div[@class='user-profile-head-name']/div/text()").extract()[0]
            # the user id is the last segment of the url
            id = url.split("/")[-1]
            # grab the ul once so the later xpaths can build on it
            all_lis = sel.xpath("//div[@class='user-profile-head-info-b']/ul")
            click_nums = all_lis.xpath("//li[1]/div[1]/text()").extract()[0]
            # big bloggers have counts like "xx,888"; MySQL can't store that comma in an
            # integer column, so strip it out
            if "," in click_nums:
                click_nums = all_lis.xpath("//li[1]/div[1]/text()").extract()[0].replace(",", "")
            original_num = all_lis.xpath("//li[2]/a/div[1]/text()").extract()[0]
            if "," in original_num:
                original_num = all_lis.xpath("//li[2]/a/div[1]/text()").extract()[0].replace(",", "")
            # the remaining fields are extracted the same way
            rate = all_lis.xpath("//li[3]/a/div[1]/text()").extract()[0]
            follower_nums = all_lis.xpath("//li[4]/a/div[1]/text()").extract()[0]
            if "," in follower_nums:
                follower_nums = all_lis.xpath("//li[4]/a/div[1]/text()").extract()[0].replace(",", "")
            frame_li = sel.xpath("//div[@class='aside-common-box-content']/ul")
            # some users have no comment count, so this is commented out
            # comments_nums = frame_li.xpath('//li[4]/div[1]/span/text()').extract()[0]
            # if comments_nums:
            #     if "," in comments_nums:
            #         comments_nums = frame_li.xpath('//li[4]/div[1]/span/text()').extract()[0].replace(",", "")
            like_nums = frame_li.xpath("//li[3]/div/span/text()").extract()[0]
            if "," in like_nums:
                like_nums = frame_li.xpath("//li[3]/div/span/text()").extract()[0].replace(",", "")
            # introduce = sel.xpath("//div[@class='user-profile-head-introduction']/p/text()").extract()[0]
            # introduce = introduce.strip()
            # assign the values to the model
            author.name = name
            author.id = id
            author.clickIn_nums = click_nums
            author.original_num = original_num
            author.rate = rate
            # author.comments_nums = comments_nums
            author.like_nums = like_nums
            # author.introduce = introduce
            author.follower_nums = follower_nums
            # if the author already exists, save() does an update; otherwise force an insert
            existed_author = Author.select().where(Author.id == author.id)
            if existed_author:
                author.save()
            else:
                author.save(force_insert=True)
# same logic as the thread above
class ParseTopListThread(Thread):
    def run(self):
        while 1:
            try:
                url = topic_list_urls.pop()
            except IndexError as e:
                time.sleep(1)
                continue
            print("Fetching topic list page: {}".format(url))
            resp = requests.get(url).text
            sel = Selector(text=resp)
            all_div = sel.xpath('//div[@class="user-tabs"]/div/div')
            for div in all_div:
                topic = Topic()
                # always check that extract() returned something before taking [0];
                # this one cost me a good half hour to track down
                if div.xpath('div[1]/div[2]/p/text()').extract():
                    title = div.xpath('./div[1]/div[2]/p/text()').extract()[0]
                    topic.title = title
                if div.xpath('div[1]/div[2]/div/div[1]/div/div/text()').extract():
                    content = div.xpath('div[1]/div[2]/div/div[1]/div/div/text()').extract()[0]
                    topic.content = content
                topics_url = parse.urljoin(domain, div.xpath('div[1]/div[3]/div/div/div[3]/a/@href').extract()[0])
                author = div.xpath('div[1]/div[1]/a/span/text()').extract()[0]
                author_url = div.xpath('div[8]/div[1]/a/@href').extract()[0]
                # creat_time_str = div.xpath('div[1]/div[1]/span/span/text()').extract()[0]
                # print(author_url)
                # creat_time = datetime.strptime(creat_time_str, '%m月%d日')
                # these nums may be empty
                answer_nums = div.xpath('div[1]/div[3]/div/div/div[3]/a/span/text()').extract()[0]
                click_nums = div.xpath('div[1]/div[3]/div/div/div[1]/span/text()').extract()[0]
                # may be empty as well
                like_nums = div.xpath('div[1]/div[3]/div/div/div[2]/span/text()').extract()[0]
                # topic.title = title
                # topic.content = content
                topic.id = int(topics_url.split('/')[-1])
                topic.author = author
                # topic.creat_time = creat_time
                topic.answer_nums = answer_nums
                topic.click_nums = click_nums
                topic.like_nums = like_nums
                existed_topics = Topic.select().where(Topic.id == topic.id)
                # peewee.IntegrityError: (1062, "Duplicate entry '600306107' for key 'PRIMARY'")
                if existed_topics:
                    topic.save()
                else:
                    topic.save(force_insert=True)
                # print(topic.creat_time)
                # push the urls onto the lists so the worker threads can pick them up
                topic_list_urls.append(topics_url)
                author_list.append(author_url)


# I couldn't get this json, I'm really out of ideas; if anyone can help me out here, please do
# class ParseTopicDetailThread(Thread):
#     def run(self):
#         while 1:
#             url = topic_list.pop()
#             print("Fetching topic: {}".format(url))
if __name__ == '__main__':
    last_url = get_last_urls()
    for url in last_url:
        # seed the work list with the starting urls
        topic_list_urls.append(url)
    # start the threads
    topic_list_thread = ParseTopListThread()
    author_list_thread = ParseAuthorThread()
    topic_list_thread.start()
    author_list_thread.start()
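For the threading part, one possible refinement (just a sketch, not something the code above does) is to hand work between threads with queue.Queue instead of plain lists: Queue.get() blocks until an item is available, so the try/except IndexError plus time.sleep(1) polling isn't needed. Roughly:

[Python]
from queue import Queue, Empty
from threading import Thread

# sketch only: a thread-safe queue instead of the shared author_list
author_queue = Queue()


class ParseAuthorWorker(Thread):
    def run(self):
        while True:
            try:
                # blocks for up to 10s waiting for work instead of busy-polling
                url = author_queue.get(timeout=10)
            except Empty:
                break  # nothing arrived for a while, let the thread finish
            try:
                print("Fetching user: {}".format(url))
                # ... same parsing and saving logic as ParseAuthorThread above ...
            finally:
                author_queue.task_done()


# producers would then call author_queue.put(author_url) instead of author_list.append(...)

On the database side, peewee also has a Model.replace() helper (insert-or-replace keyed on the primary key) that could probably replace the select-then-save()/save(force_insert=True) pattern, though I haven't tried it here.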