[原创源码] 用selenium和requests自动获取百度、B站、微博热榜并推送到企业微信
这几项需要自己补充完整,否则无法推送:
corpid = ''#企业微信的 corpid
corpsecret = ''#企业微信 corpsecret
appid = ''#企业微信 appid
最近在学Selenium,想起平时看热榜比较多,所以做了这个,每天定时推送百度、B站、微博热榜到企业微信上,不用自己去慢慢找了。本来还写了爬抖音热榜的代码,但不能在微信里直接打开抖音的链接,所以抖音删了。
添加计划任务,可以后台自动推送
推送效果类似这样子
完整代码
import json
from datetime import datetime
from os import path
import emoji
import requests as req
from fake_useragent import UserAgent
from selenium import webdriver
# from urllib.parse import quote_plus#搜索时,中文转英文
corpid = ''  # WeCom (Enterprise WeChat) corpid; must be filled in or nothing can be pushed
corpsecret = ''  # WeCom application corpsecret
appid = ''  # WeCom application id (sent as "agentid" by send_wx)
tToday = datetime.now().strftime('%H:%M')  # local time as HH:MM, captured once at import
send_count = 11  # exclusive loop bound: n-1 (i.e. 10) entries are pushed
def filter_str(s):
    """Normalise a hot-list title before it is stored and compared.

    Runs of the same punctuation mark are collapsed to a single mark, all
    spaces are removed and emoji are stripped.

    Args:
        s: The raw title.

    Returns:
        The cleaned title.
    """
    # Imported locally: the surrounding script does not import ``re``.
    import re

    collapsible = r'!!??.。-_#¥$%&·`、、:;*/\\'
    # One regex pass collapses runs of any length (repeated pairwise
    # replacement only handles runs up to a fixed size).
    s = re.sub('([' + re.escape(collapsible) + r'])\1+', r'\1', s)
    s = s.replace(' ', '')
    # emoji >= 2.0 removed get_emoji_regexp(); prefer replace_emoji() and
    # fall back to the old API on older installations.
    strip_emoji = getattr(emoji, 'replace_emoji', None)
    if strip_emoji is not None:
        return strip_emoji(s, replace='')
    return emoji.get_emoji_regexp().sub('', s)
def get_with_se(site):
    """Scrape a hot list with headless Chrome.

    Args:
        site: ``'bili'`` for the Bilibili ranking page or ``'baidu'`` for
            the Baidu realtime board. Any other value scrapes nothing.

    Returns:
        A list of ``(title, url)`` tuples holding at most
        ``send_count - 1`` entries.
    """
    # Imported locally so the block does not rely on a module-level import.
    from urllib.parse import quote_plus

    option = webdriver.ChromeOptions()
    # add_experimental_option() keeps one value per option name, so both
    # switches have to go into a single call; two calls would leave only
    # the last one active.
    option.add_experimental_option(
        'excludeSwitches', ['enable-automation', 'enable-logging'])
    option.add_experimental_option('useAutomationExtension', False)
    option.add_argument('user-agent=' + UserAgent().random)  # random UA
    for flag in (
            '--no-sandbox',  # avoids the "DevToolsActivePort file doesn't exist" error
            '--disable-gpu',
            '--hide-scrollbars',
            'blink-settings=imagesEnabled=false',  # do not load images
            '--headless',
    ):
        option.add_argument(flag)

    wd = webdriver.Chrome(options=option)
    hot_list = []
    try:
        # Hide navigator.webdriver from page scripts.
        wd.execute_cdp_cmd(
            'Page.addScriptToEvaluateOnNewDocument', {
                'source':
                'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
            })
        if site == 'bili':
            wd.get('https://www.bilibili.com/v/popular/rank/all')
            for i in range(1, send_count):
                item = f'//*[@id="app"]/div/div/div/ul/li[{i}]/div/div'
                # find_element(by, value) exists in Selenium 3 and 4; the
                # find_element_by_* helpers were removed from Selenium 4.
                link = wd.find_element('xpath', f'{item}/a')
                url = link.get_attribute('href')
                title = link.text
                zuozhe = wd.find_element('xpath', f'{item}/div/a/span').text
                # NOTE(review): the appended value was lost from the posted
                # code; "uploader:title" is an assumption -- confirm.
                hot_list.append((f'{zuozhe}:{title}', url))
        elif site == 'baidu':
            wd.get('https://top.baidu.com/board?tab=realtime')
            for i in range(1, send_count):
                title = wd.find_element(
                    'css selector',
                    f'.category-wrap_iQLoo:nth-child({i}) .c-single-text-ellipsis'
                ).text.strip()
                # Percent-encode the keyword so the search link stays valid.
                url = f'https://www.baidu.com/s?wd={quote_plus(title)}'
                hot_list.append((title, url))
    finally:
        # Always shut the browser down, even when an element lookup fails.
        wd.quit()
    return hot_list
def save(file, content):
    """Write *content* to *file*, replacing whatever was there.

    GB18030 is used because it can encode every Unicode character while
    staying byte-compatible with files previously written as GB2312.
    """
    with open(file, 'w', encoding='gb18030') as f:
        f.write(content)


def load(file):
    """Return the lines of *file* as a list, without trailing newlines.

    An empty file yields an empty list.
    """
    with open(file, 'r', encoding='gb18030') as f:
        return [line.rstrip('\n') for line in f]
def send_wx(x):
    """Send the text *x* to all members through the WeCom application.

    Args:
        x: Message content.

    Returns:
        The raw response body of the ``message/send`` call, as text.

    Raises:
        KeyError: If the token endpoint does not return ``access_token``.
    """
    # Let requests build the query string so the credentials are escaped.
    token_resp = req.get(
        'https://qyapi.weixin.qq.com/cgi-bin/gettoken',
        params={'corpid': corpid, 'corpsecret': corpsecret},
        timeout=5)
    body = token_resp.json()
    if 'access_token' not in body:
        raise KeyError(f'access_token missing from gettoken response: {body}')
    tokens = body['access_token']
    url = "https://qyapi.weixin.qq.com/cgi-bin/message/send?access_token=" + tokens
    payload = {
        "touser": "@all",
        "msgtype": "text",
        "agentid": appid,
        "text": {
            "content": x
        },
        "safe": 0,
    }
    return req.post(url, data=json.dumps(payload), timeout=9).text
def weibo():
    """Fetch the Weibo hot-search list.

    Returns:
        A list of ``(title, url)`` tuples with at most ``send_count - 1``
        entries, in the order given by the endpoint.
    """
    # Imported locally so the block does not rely on a module-level import.
    from urllib.parse import quote

    # A timeout keeps a scheduled run from hanging on a stalled connection.
    response = req.get("https://weibo.com/ajax/side/hotSearch", timeout=10)
    data_json = response.json()['data']['realtime']
    hot_list = []
    for item in data_json[:max(send_count - 1, 0)]:
        title = item['note']
        # %23 is '#'; the keyword itself is percent-encoded as well.
        url = 'https://s.weibo.com/weibo?q=%23' + quote(item['word']) + '%23'
        hot_list.append((title, url))
    return hot_list
def send_top(site_name, hot_list):
    """Push the entries of *hot_list* that were not seen on the last run.

    The cleaned titles of the current list are written to a per-site text
    file so the next run can tell which entries are new.

    Args:
        site_name: ``'bili'``, ``'baidu'`` or ``'weibo'``; also used as the
            file name of the stored title list.
        hot_list: Iterable of ``(title, url)`` pairs.
    """
    site = {'bili': 'B站', 'baidu': '百度', 'weibo': '微博'}.get(
        site_name, site_name)
    # NOTE(review): the initial value of new_list was lost from the posted
    # code. A header line built from `site` and `tToday` is assumed because
    # both are otherwise unused and the send condition is `> 1` -- confirm.
    new_list = [f'{site}热榜 {tToday}']
    full_hots_list = []
    start_num = 1
    file = f'E:/Backup/脚本/txt/{site_name}.txt'
    if not path.exists(file):
        save(file, '')  # create the history file on first use
    # Strip line endings so stored titles compare equal to fresh ones.
    old_hot = {line.strip() for line in (load(file) or [])}
    for title, url in hot_list:
        title = filter_str(title)
        full_hots_list.append(title)  # remembered for the next comparison
        if title not in old_hot:
            new_list.append(f'<a href="{url}">{start_num}. {title}</a>')
            # NOTE(review): original indentation is lost; numbering only the
            # pushed entries is assumed.
            start_num += 1
    new_txts = '\n\n'.join(new_list)
    save(file, '\n'.join(full_hots_list))
    if len(new_list) > 1:  # header plus at least one new entry
        send_wx(new_txts)
if __name__ == '__main__':
    # Bilibili and Baidu are scraped with Selenium, Weibo via its JSON API.
    for name in ('bili', 'baidu'):
        send_top(name, get_with_se(name)[:send_count])
    send_top('weibo', weibo())
# (forum footer) Post last edited by wkdxz on 2022-6-18 18:01
小初 发表于 2022-6-18 14:53
在弄个抖音知乎的呗
抖音的热榜做了,因为头条链接无法在微信打开,所以没加上去。知乎日报:https://www.52pojie.cn/thread-1651017-1-1.html 的热榜我有,每天推送一次。
抖音的获取模块,返回一个带关键词和搜索URL的列表
def get_douyin():
    """Fetch the Douyin hot-search words.

    Not pushed by the script because WeChat cannot open Douyin links.

    Returns:
        A list of ``(title, url)`` tuples, one per hot word.
    """
    # Imported locally so the block does not rely on a module-level import.
    from urllib.parse import quote

    r = req.get('https://aweme.snssdk.com/aweme/v1/hot/search/list/',
                timeout=5)
    word_list = r.json()['data']['word_list']
    hot_list = []
    for item in word_list:
        title = item['word']
        # Percent-encode the keyword so the search link stays valid.
        url = f'https://www.douyin.com/search/{quote(title)}'
        hot_list.append((title, url))
    return hot_list
# (forum footer) Post last edited by wkdxz on 2022-8-24 14:21
wangke333 发表于 2022-8-24 14:00
代码有误,大佬检查下
大佬不敢当,给个自用的新版本,需要自建一个Access数据库(可以使用SQL数据库替代,改下连接数据库的代码就可以了),结构如下:
from collections import Counter
from datetime import datetime
from random import sample
from urllib.parse import quote_plus#搜索时,中文转英文
import difflib
import jieba
import json
import pypyodbc
import re
import requests
corpid = ''  # WeCom (Enterprise WeChat) corpid
corpsecret = ''  # WeCom application corpsecret
appid = ''  # WeCom application id (sent as "agentid" by send_wx)
str_now = datetime.now().strftime('%H:%M')  # local time as HH:MM, captured once at import
str_month = datetime.now().strftime('%Y-%m')  # current year-month; alternative format: '%Y-%m-%d'
send_count = 11  # push n-1 entries
def data(sql, write=False):
    """Execute *sql* against the Access database ``E:/hots.mdb``.

    Args:
        sql: The SQL statement to run.
        write: When true the statement is committed and nothing is
            fetched; otherwise all result rows are fetched.

    Returns:
        The fetched rows for a read, ``None`` for a write.
    """
    hot_data = 'E:/hots.mdb'
    conn = pypyodbc.connect(
        u'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=' + hot_data)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql)
            if write:
                conn.commit()  # commit immediately
                return None
            return cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Release the connection on every path, including errors.
        conn.close()
def similar_title(title, old_title_list, bili=False):
    """Tell whether *title* closely resembles any previously seen title.

    Args:
        title: The candidate title.
        old_title_list: Iterable of earlier titles to compare against.
        bili: When true no comparison is made and ``False`` is returned.

    Returns:
        ``True`` if ``quick_ratio()`` against any old title exceeds 0.7,
        otherwise ``False``.
    """
    if bili:
        return False
    # SequenceMatcher caches data about its second sequence, so the title
    # is set once there and only the first sequence changes per comparison.
    matcher = difflib.SequenceMatcher(None)
    matcher.set_seq2(title)
    for old_title in old_title_list:
        matcher.set_seq1(old_title)
        if matcher.quick_ratio() > 0.7:
            return True
    return False
def filter_str(s):
    """Normalise a hot-list title before it is stored and compared.

    Runs of the same punctuation mark are collapsed to a single mark, a
    fixed set of decorative symbols is dropped, and all spaces are removed.

    Args:
        s: The raw title.

    Returns:
        The cleaned title.
    """
    collapsible = r'!!??.。-_&·`、、:;*/\\'
    # One regex pass collapses runs of any length (repeated pairwise
    # replacement only handles runs up to a fixed size).
    s = re.sub('([' + re.escape(collapsible) + r'])\1+', r'\1', s)
    for d in r'{}“”【】~●▲▼◆■★':
        s = s.replace(d, '')
    return s.replace(' ', '')
def zhong_wen(s):
    """Keep only the characters of *s* that are used as the title keyword.

    Retained characters are U+4E00..U+9FA5, ASCII letters, digits, ``.``
    and ``-``; everything else (quotes, spaces, punctuation) is dropped.

    Args:
        s: The title text.

    Returns:
        The filtered string, possibly empty.
    """
    kept = re.findall('[\u4e00-\u9fa5A-Za-z0-9.-]', s)
    return ''.join(kept)
def save(hot_list):
    """Insert one record into the ``list`` table.

    Args:
        hot_list: A ``(title, url, site_name)`` triple.
    """
    # The statement is assembled as text, so single quotes inside a value
    # are doubled to keep them from terminating the SQL string literal.
    title, url, site_name = (str(v).replace("'", "''") for v in hot_list)
    sql = f"insert into list (标题,网址,来源) values('{title}','{url}','{site_name}')"
    data(sql, True)
def recently_hots(isbili=False, days=10):
    """Return what is already stored, for duplicate detection.

    Args:
        isbili: When true, return the stored URLs whose source is
            ``'bili'`` regardless of date.
        days: Otherwise, return the titles stored within the last *days*
            days regardless of source (default 10).

    Returns:
        A set of strings (URLs or titles).
    """
    if isbili:
        sql = "select 网址 from list where 来源='bili'"
    else:
        # int() keeps anything but a number out of the SQL text.
        sql = f"select 标题 from list where 日期>date()-{int(days)}"
    # Each row holds a single column; unwrap it so callers get plain
    # strings (they join them and test membership with strings).
    return {row[0] for row in data(sql)}
def send_wx(x):
    """Send the text *x* to all members through the WeCom application.

    Args:
        x: Message content.

    Returns:
        The decoded JSON response of the ``message/send`` call.

    Raises:
        KeyError: If the token endpoint does not return ``access_token``.
    """
    # Let requests build the query string so the credentials are escaped.
    token_resp = requests.get(
        'https://qyapi.weixin.qq.com/cgi-bin/gettoken',
        params={'corpid': corpid, 'corpsecret': corpsecret},
        timeout=5)
    body = token_resp.json()
    if 'access_token' not in body:
        raise KeyError(f'access_token missing from gettoken response: {body}')
    tokens = body['access_token']
    url = f"https://qyapi.weixin.qq.com/cgi-bin/message/send?access_token={tokens}"
    payload = {
        "touser": "@all",
        "msgtype": "text",
        "agentid": appid,
        "text": {
            "content": x
        },
        "safe": 0,
    }
    return requests.post(url, data=json.dumps(payload), timeout=9).json()
def weibo():
    """Fetch the Weibo hot-search list.

    Only the first ``send_count`` entries of the response are considered,
    and entries carrying an ``is_ad`` key are skipped.

    Returns:
        A set of ``(title, url)`` tuples.
    """
    # A timeout keeps a scheduled run from hanging on a stalled connection.
    resp = requests.get("https://weibo.com/ajax/side/hotSearch", timeout=10)
    realtime = resp.json()['data']['realtime']
    hot_list = set()
    for item in realtime[:send_count]:
        if 'is_ad' in item:  # advertisement
            continue
        title = item['note']
        # Only titles containing '%' are encoded, so the '%' cannot be
        # misread as the start of an escape sequence in the URL.
        new_title = quote_plus(title) if '%' in title else title
        hot_list.add((title, f'https://s.weibo.com/weibo/{new_title}'))
    return hot_list
def hot_words(title_list):
    """Return the words that occur at least five times in *title_list*.

    Args:
        title_list: One string holding the recent titles, segmented with
            ``jieba.cut_for_search``.

    Returns:
        A list of words whose count is >= 5.
    """
    words = jieba.cut_for_search(title_list)
    # NOTE(review): the filter expression was lost from the posted code;
    # dropping blank and single-character tokens is an assumption -- confirm.
    true_words = [w for w in words if len(w.strip()) > 1]
    sl = Counter(true_words)
    # NOTE(review): the head of this expression was lost as well; only the
    # ">= 5" threshold is original.
    return [word for word, count in sl.items() if count >= 5]
def baidu():
    """Fetch the Baidu realtime hot board.

    The board page embeds its data as JSON inside an HTML comment
    (``<!--s-data:...-->``), which is extracted and parsed.

    Returns:
        A set of ``(title, url)`` tuples; empty when the embedded JSON is
        not found in the page.
    """
    resp = requests.get('https://top.baidu.com/board?tab=realtime', timeout=10)
    resp.encoding = 'utf-8'
    hot_list = set()
    found = re.search('<!--s-data:(.*false})-->', resp.text, re.S)
    if found is None:
        # Page layout changed or the request was blocked: nothing to parse.
        return hot_list
    datas = json.loads(found.group(1))
    # NOTE(review): a subscript after ['cards'] appears to have been lost
    # from the posted code; the first card is assumed -- confirm.
    real_data = datas['data']['cards'][0]['content']
    for item in real_data[:send_count]:
        title = item['word']
        # Skip empty titles and titles ending in '#'.
        if not title or title.endswith('#'):
            continue
        # Only titles containing '%' are encoded, so the '%' cannot be
        # misread as the start of an escape sequence in the URL.
        new_title = quote_plus(title) if '%' in title else title
        hot_list.add((title, f'https://www.baidu.com/s?wd={new_title}'))
    return hot_list
def all_bili_list(ups):
    """Collect popular, not-yet-pushed videos of the given uploaders.

    A video qualifies when it has more than 3,000,000 plays, more than
    3000 danmaku or more than 2000 comments, and its URL is not already
    stored for source ``'bili'``.

    Args:
        ups: Mapping of uploader uid to nickname.
            NOTE(review): the subscript in the title expression was lost
            from the posted code; ``ups[uid]`` is assumed -- confirm.

    Returns:
        A set of ``('nickname:title', url)`` tuples.
    """
    hot_list = set()
    # Query the stored URLs once instead of once per video.
    seen_urls = recently_hots(True)
    for uid in ups:
        params = (
            ('mid', uid),
            ('ps', '30'),
            ('tid', '0'),
            ('pn', '1'),
            ('keyword', ''),
            ('order', 'pubdate'),
            ('jsonp', 'jsonp'),
        )
        rjson = requests.get('https://api.bilibili.com/x/space/arc/search',
                             params=params, timeout=10).json()
        # A response without the expected payload is skipped rather than
        # aborting the whole run.
        vlist = ((rjson.get('data') or {}).get('list') or {}).get('vlist') or []
        for video in vlist:
            play = video['play']  # play count
            danmu = video['video_review']  # danmaku count
            comment = video['comment']  # comment count
            title = video['title']
            url = f"https://www.bilibili.com/video/{video['bvid']}"
            popular = play > (300 * 10000) or danmu > 3000 or comment > 2000
            if popular and url not in seen_urls:
                hot_list.add((f'{ups[uid]}:{title}', url))
    return hot_list
def bili():
    """Pick up to three popular videos from ten randomly chosen uploaders.

    Returns:
        A list of three randomly sampled ``(title, url)`` tuples when at
        least three candidates exist, otherwise the whole candidate set.
    """
    sql = '''
    select top 10 Uid,Nick
    from up
    where unlike=false
    order by rnd(id)
    '''  # randomly draw 10 uploaders
    ups = dict(data(sql))
    hots = all_bili_list(ups)
    shu = 3  # number of entries to push
    # random.sample() only accepts sequences on Python >= 3.11, so the set
    # is converted to a list first.
    return sample(list(hots), shu) if len(hots) >= shu else hots
def replace_hot_title(title):
    """Wrap every current hot word found in *title* in parentheses.

    The hot words are computed from the titles stored during the last day.

    Args:
        title: The title to decorate.

    Returns:
        The title with each hot word ``w`` replaced by ``(w)``.
    """
    recent_text = ' '.join(recently_hots(False, 1))
    for word in hot_words(recent_text):
        if word in title:
            title = title.replace(word, f'({word})')
    return title
def send_top_news(site_name, hot_list):
    """Store and push the entries of *hot_list* that are not duplicates.

    For Bilibili every entry is accepted; for other sites an entry is
    accepted only if its title is not similar to a recently stored one.
    Accepted entries are written to the database and sent as one message.
    When the WeCom settings are incomplete the message is printed instead.

    Args:
        site_name: ``'baidu'``, ``'bili'`` or ``'weibo'``.
        hot_list: Iterable of ``(title, url)`` pairs.
    """
    site = {'baidu': '百度', 'bili': 'B站', 'weibo': '微博'}.get(
        site_name, site_name)
    # NOTE(review): the initial value of new_list was lost from the posted
    # code. A header line built from `site` and `str_now` is assumed because
    # both are otherwise unused and the send condition is `> 1` -- confirm.
    new_list = [f'{site}热榜 {str_now}']
    start_num = 1
    old_title_list = recently_hots()  # titles already stored
    for title, url in hot_list:
        title = filter_str(title)
        # Reduced form of the title that is written to the database.
        new_title = zhong_wen(title)
        if site_name == 'bili' or not similar_title(title, old_title_list):
            # NOTE(review): the argument of save() was lost; the triple
            # (title, url, source) matches what save() unpacks -- confirm.
            save((new_title, url, site_name))
            new_list.append(
                f'<a href="{url}">{chr(10101+start_num)} {replace_hot_title(title)}</a>'
            )
            # NOTE(review): original indentation is lost; numbering only the
            # pushed entries is assumed.
            start_num += 1
    if len(new_list) > 1:  # header plus at least one new entry
        new_txts = '\n\n'.join(new_list)
        # NOTE(review): the argument of all() was lost; the three WeCom
        # settings are assumed from the printed message -- confirm.
        if not all((corpid, corpsecret, appid)):
            print('企业微信应用的信息未填写完整,不发送,只展示')
            print(new_txts)
        else:
            send_wx(new_txts)
if __name__ == '__main__':
    # Each source is fetched and pushed in turn.
    for name, fetch in (('bili', bili), ('baidu', baidu), ('weibo', weibo)):
        print(f'爬取 {name} ...')
        send_top_news(name, fetch())
# (forum reply) Is this a cloud function?
yzqhj 发表于 2022-6-15 15:45
这是云函数么?
不是不是,我还没用过云函数。
6666666666
wkdxz 发表于 2022-6-15 15:53
不是不是我还没用过云函数
那这个是本地运行的?
yzqhj 发表于 2022-6-15 17:36
那这个是本地运行的?
是的,理论上也可以在服务器运行,只是要改下文件路径。
顶了~~~~~~~
感谢分享
请教下楼主:corpsecret = ''#企业微信 corpsecret,这个corpsecret是不是要新建一个自建的应用,才能生成?是否可以直接调用现在的?
hannce 发表于 2022-6-17 23:43
请教下楼主:corpsecret = ''#企业微信 corpsecret,这个corpsecret是不是要新建一个自建的应用,才能生 ...
可以调用现在的,只要是应用都可以。
页:
[1]
2