本帖最后由 judgecx 于 2022-6-12 00:43 编辑
之前写的那个版本是模拟复制粘贴来获取文章的，现在这个版本是自动去获取的。
Fiddler 配置教程请自行查看：Fiddler安装与配置以及Fiddler的移动端抓包_LYN-Favorite的博客-CSDN博客_fiddler配置
需要自行下载微信 2.9.5.41 版本或者更低的版本（论坛里面有，请自行搜索）。代码是这几天时不时写一点拼起来的，所以比较乱，但是也能用，到时候再改良。
cookie 和 key 的有效期只有半个小时左右，过期后需要自己重新抓取。
使用教程：从 http://101.33.206.206/20220611214013.zip 下载 zip 包，自己解压后查看。
获取公众号首页的十来篇文章
[Python] 纯文本查看 复制代码
import requests
import pypandoc
import time
import pdfkit
from lxml import etree
# Fetch the article list from the account's home page and save each article's content.
# Request headers that ge() sends with every profile-page request.
# The Cookie value is a captured session credential; per the notes at the top of
# this post it stays valid for only about half an hour and must then be re-captured.
headers = {
"Host": "mp.weixin.qq.com",
"User-Agent": "Mozilla/5.0 (Linux; Android 10; WLZ-AN00 Build/HUAWEIWLZ-AN00; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/86.0.4240.99 XWEB/3225 MMWEBSDK/201201 Mobile Safari/537.36 MMWEBID/5471 MicroMessenger/7.0.22.1820(0x2700163B) Process/toolsmp WeChat/arm64 Weixin NetType/WIFI Language/zh_CN ABI/arm64",
"Cookie": "wxuin=734864449; devicetype=Windows10x64; version=62090529; lang=zh_CN; pass_ticket=OCywUL8u3cNrUaTK+dUaeSmss3ENFAVB3xHg9zR3yGVcmDcllKVLYnfqJDQdqviE; wap_sid2=CMHItN4CEooBeV9IQWhfOUtvR1F2bkl2Ml8tUkNvSVh6VC1kRmlsTFBtZXVVVFI4bjVaN1g3cDY1RnRtajFiTkx6d3pIQ3R2TGV0b0ViZzRsWjlVWVJUd1NrMVc4Um1La1RhU2xmd1lKTW5CNGVvZWJIeVhYT0RTMk5KNnB1N19YanBjNlhpd0NjcTgyOFNBQUF+MLu0kpUGOA1AlU4="
}
def ge(url):
    """Append every article link found on one profile page to ``00111.txt``.

    The page is requested with the module-level ``headers`` and its text is
    scanned for the value of each ``content_url`` field; every value is
    written to ``00111.txt`` on its own line. Nothing is written when the
    page contains no such field.

    Args:
        url: Address of the profile page. Surrounding whitespace (for
            example the line break left by ``readlines()``) is ignored.

    Raises:
        requests.exceptions.RequestException: the request failed or did not
            answer within 30 seconds.
    """
    r = requests.get(url.strip(), headers=headers, timeout=30)

    # The field delimiters contain the HTML entity for a double quote. It is
    # assembled from two pieces so that it cannot be decoded into a bare quote
    # when this script is published or copied through an HTML page.
    # NOTE(review): the delimiters were reconstructed from literals whose
    # quotes had been decoded that way - confirm against a live response.
    quot = '&' + 'quot;'
    start_mark = 'content_url' + quot + ':' + quot
    end_mark = quot + ',' + quot + 'source_url'

    # Walk the content_url chunks themselves rather than counting a different
    # field, so the loop can never index past the last link.
    links = [chunk.split(end_mark)[0] for chunk in r.text.split(start_mark)[1:]]
    if not links:
        return

    # One open for the whole batch; gbk is the codec used wherever this queue
    # file is rewritten.
    with open('00111.txt', 'a+', encoding='gbk') as f:
        f.writelines(link + '\n' for link in links)
# Feed every profile-page address listed in urls.txt (one per line) to ge().
with open("urls.txt", 'r') as f:
    # Drop the line break that would otherwise travel with each address.
    urls = [line.strip() for line in f]
for url in urls:
    # A blank line is not an address; requesting it would only raise.
    if url:
        ge(url)
def ge1(url1, queue_file='00111.txt'):
    """Save one article as ``<title>.html`` and drop it from the queue file.

    The article is downloaded, its title is turned into a file name, and the
    page is appended to that file with every ``data-src`` replaced by
    ``src``. Afterwards the first line of ``queue_file`` is removed, so the
    queue only ever lists articles that still have to be fetched.

    Args:
        url1: Article address, typically a raw line of the queue file.
            Surrounding whitespace is ignored; a blank value is only removed
            from the queue, nothing is requested.
        queue_file: Text file (gbk) whose first line is removed after the
            call. Defaults to the file written by the link collector.

    Raises:
        requests.exceptions.RequestException: the request failed or did not
            answer within 30 seconds.
    """
    url1 = url1.strip()
    if url1:
        res = requests.get(url1, timeout=30)
        _save_article(res.text)
        # Pause between downloads (presumably to keep the request rate low).
        time.sleep(2)
    _drop_first_line(queue_file)


def _save_article(page):
    """Append ``page`` to ``<title>.html`` with ``data-src`` turned into ``src``."""
    title = ''
    for each in etree.HTML(page).xpath('//*[@id="activity-name"]/text()'):
        # Strip the padding around the title text.
        cleaned = each.replace('\n', '').replace(' ', '')
        if cleaned:
            # As before, the last non-empty text node wins.
            title = cleaned
    if title:
        print(title)

    # Characters that are removed before the title is used as a file name.
    rule = r'?*:“”"<>\/|--()~`.《》·、,。?&……%¥#@!!$^_=+'
    name = title.translate(str.maketrans('', '', rule))
    if not name:
        # No usable title: fall back to a generated name instead of failing
        # on an unbound or empty file name.
        name = 'untitled_' + str(int(time.time()))

    # Appending the already converted text yields the same file as the former
    # write / re-read / rewrite sequence, with a single file operation.
    # NOTE(review): data-src is presumably the lazy-load image attribute, so
    # renaming it lets images show in the saved copy - confirm.
    with open(name + '.html', 'a', encoding='utf-8') as f:
        f.write(page.replace("data-src", "src"))


def _drop_first_line(path):
    """Rewrite ``path`` without its first line, closing both file handles."""
    with open(path, 'r', encoding='gbk') as f:
        remaining = f.readlines()[1:]
    with open(path, 'w', encoding='gbk') as f:
        f.writelines(remaining)
# Download every article queued in 00111.txt.
# gbk is the codec ge1 uses when it rewrites this file.
with open("00111.txt", 'r', encoding='gbk') as f:
    urls = f.read().splitlines()
for url1 in urls:
    # Every line is forwarded, without its line break: each ge1 call removes
    # the first line of the queue file, so skipping a line here would leave
    # the file out of step with this loop.
    ge1(url1.strip())
获取公众号其他页面的文章
[Python] 纯文本查看 复制代码
import requests
import pypandoc
import time
import pdfkit
from lxml import etree
import json
# Fetch the later pages of the article list and save every article link to ymbt.txt.
u = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=MzAwMzYxNzc1OA==&f=json&offset='
# range(10, 20, 10) yields the single offset 10; raise the stop value to walk
# through further pages.
for x in range(10, 20, 10):
    # The uin/key pair is a captured credential; per the notes at the top of
    # this post it expires after about half an hour.
    url = str(u)+str(x)+"&count=10&is_ok=1&scene=124&uin=NzM0ODY0NDQ5&key=2dfcb79daaf542fa15b98469da8929fa378f4f9e225fa10e509d5998a5bcc2c81715e3031a54899b6a98cdc26adeeb825aae3c891b6a736333f0001825d9c34026b436333a580c60b8193c91a48b02dc8f3751aaf46d8c51f42c1dfe0ea2553ee5554a98f9e61d041b98ad2af7708b0aad947afea48fa366ac762526d5d84f48"
    a1 = requests.get(url, timeout=30)

    links = []
    # Walk the content_url chunks themselves rather than counting the title
    # fields, so the loop can never index past the last link.
    for chunk in a1.text.split('''"content_url\\":\\"''')[1:]:
        r_url3 = chunk.split('''\\",\\"source_url''')[0]
        # The link arrives with backslash escapes; remove them.
        aa2 = r_url3.replace("\\", "")
        print(aa2)
        links.append(aa2 + '\n')

    if links:
        # One open per page; gbk is the codec used wherever this queue file
        # is rewritten.
        with open('ymbt.txt', 'a+', encoding='gbk') as f:
            f.writelines(links)
    # Pause before the next page request.
    time.sleep(1)
def ge1(url1, queue_file='ymbt.txt'):
    """Save one article as ``<title>.html`` and drop it from the queue file.

    The article is downloaded, its title is turned into a file name, and the
    page is appended to that file with every ``data-src`` replaced by
    ``src``. Afterwards the first line of ``queue_file`` is removed, so the
    queue only ever lists articles that still have to be fetched.

    Args:
        url1: Article address, typically a raw line of the queue file.
            Surrounding whitespace is ignored; a blank value is only removed
            from the queue, nothing is requested.
        queue_file: Text file (gbk) whose first line is removed after the
            call. Defaults to the file written by the link collector.

    Raises:
        requests.exceptions.RequestException: the request failed or did not
            answer within 30 seconds.
    """
    url1 = url1.strip()
    if url1:
        res = requests.get(url1, timeout=30)
        _save_article(res.text)
        # Pause between downloads (presumably to keep the request rate low).
        time.sleep(2)
    _drop_first_line(queue_file)


def _save_article(page):
    """Append ``page`` to ``<title>.html`` with ``data-src`` turned into ``src``."""
    title = ''
    for each in etree.HTML(page).xpath('//*[@id="activity-name"]/text()'):
        # Strip the padding around the title text.
        cleaned = each.replace('\n', '').replace(' ', '')
        if cleaned:
            # As before, the last non-empty text node wins.
            title = cleaned
    if title:
        print(title)

    # Characters that are removed before the title is used as a file name.
    rule = r'?*:“”"<>\/|--()~`.《》·、,。?&……%¥#@!!$^_=+'
    name = title.translate(str.maketrans('', '', rule))
    if not name:
        # No usable title: fall back to a generated name instead of failing
        # on an unbound or empty file name.
        name = 'untitled_' + str(int(time.time()))

    # Appending the already converted text yields the same file as the former
    # write / re-read / rewrite sequence, with a single file operation.
    # NOTE(review): data-src is presumably the lazy-load image attribute, so
    # renaming it lets images show in the saved copy - confirm.
    with open(name + '.html', 'a', encoding='utf-8') as f:
        f.write(page.replace("data-src", "src"))


def _drop_first_line(path):
    """Rewrite ``path`` without its first line, closing both file handles."""
    with open(path, 'r', encoding='gbk') as f:
        remaining = f.readlines()[1:]
    with open(path, 'w', encoding='gbk') as f:
        f.writelines(remaining)
# Download every article queued in ymbt.txt.
# gbk is the codec ge1 uses when it rewrites this file.
with open("ymbt.txt", 'r', encoding='gbk') as f:
    urls = f.read().splitlines()
for url1 in urls:
    # Every line is forwarded, without its line break: each ge1 call removes
    # the first line of the queue file, so skipping a line here would leave
    # the file out of step with this loop.
    ge1(url1.strip())