用Python实现微信公众号API素材库图文消息抓取

Dlan 发表于 2018-6-8 19:32

本帖最后由 Dlan 于 2018-6-8 19:35 编辑

起因：
新广告法规定，广告中不得出现“最”“国际首发”等极限词。因公众号发文上千篇，手动逐篇检查太慢，因此对接公众号的 API，抓取公众号的所有图文消息，自动检查极限词并下载所有图片。
#coding:utf-8
import werobot
import pymongo

class Gongzhonghao():
    """Wrapper around a WeRoBot instance for one WeChat official account.

    Offers helpers to count and fetch every "news" (article) material of
    the account through the robot's API client, plus a helper that starts
    the robot's server for the initial back-end verification.
    """

    # Number of materials requested per get_media_list() call.
    PAGE_SIZE = 20

    def __init__(self, token, APP_ID, ENCODING_AES_KEY, APP_SECRET):
        """Create the robot and store the account settings in its config.

        :param token: token passed to ``werobot.WeRoBot``
        :param APP_ID: value stored under the ``APP_ID`` config key
        :param ENCODING_AES_KEY: value stored under ``ENCODING_AES_KEY``
        :param APP_SECRET: value stored under the ``APP_SECRET`` config key
        """
        self.robot = werobot.WeRoBot(token=token)
        self.robot.config['HOST'] = '0.0.0.0'
        self.robot.config['PORT'] = 80
        self.robot.config['APP_ID'] = APP_ID
        self.robot.config['ENCODING_AES_KEY'] = ENCODING_AES_KEY
        self.robot.config['APP_SECRET'] = APP_SECRET

    def _getNews_Count(self):
        """Return the total number of news materials of the account.

        :return: the ``news_count`` field of the media-count response
        """
        mediacount = self.robot.client.get_media_count()
        return mediacount['news_count']

    def getNews(self):
        """Fetch all news materials of the account, page by page.

        :return: dict with ``total_count`` (the count reported by the API)
                 and ``items`` (the concatenated ``item`` lists of all pages)
        """
        news_count = self._getNews_Count()
        items = []
        offset = 0
        while offset < news_count:
            page = self.robot.client.get_media_list(
                'news', offset, self.PAGE_SIZE)
            # Each new page goes in front of the pages fetched so far, which
            # keeps the item order this method has always produced.  A page
            # without an 'item' key contributes nothing instead of raising.
            items = page.get('item', []) + items
            offset += self.PAGE_SIZE
        return {
            'total_count': news_count,
            'items': items
        }

    def echo(self):
        """Run the robot's server; used for the first-time back-end
        verification of the official account.

        :return: None
        """
        self.robot.run()

if __name__ == '__main__':
    # Placeholder credentials and host: replace '1'..'4' and 'ip' with the
    # real account settings and MongoDB address before running.
    g = Gongzhonghao('1', '2', '3', '4')
    j = g.getNews()
    client = pymongo.MongoClient('ip', 27017)
    db = client.gongzhonghao
    xxx = db.xxx
    # Collection.insert() was deprecated in PyMongo 3 and removed in
    # PyMongo 4; insert_one() stores the single result document.
    xxx.insert_one(j)

然后连接数据库进行解析，数据库中包含图文消息html代码等信息。
# -*- coding:utf-8 -*-

import os
import urllib.parse
from html.parser import HTMLParser

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

class ContentHtmlParser(HTMLParser):
    """Strip HTML tags: accumulate only the text data of the fed markup."""

    def __init__(self):
        super().__init__()
        # Concatenation of every text chunk seen so far.
        self.text = ""

    def handle_data(self, data):
        """Append one chunk of text reported by the parser."""
        self.text += data

    def get_text(self):
        """Return all text collected so far."""
        return self.text

# Module-wide MongoDB handles: the client for the server at "ip":27017 and
# the "gongzhonghao" database that the functions below read from.
mongo_client = MongoClient(host="ip", port=27017)
mongo_db = mongo_client.get_database("gongzhonghao")

def get_words(path="words.txt"):
    """Load the list of forbidden words from a UTF-8 text file.

    Every non-blank line holds either a single word or several words
    separated by the enumeration comma "、".

    :param path: file to read; defaults to "words.txt" in the current
                 working directory
    :return: list of unique, non-empty words in first-seen order
    """
    words = []
    seen = set()
    # "utf-8-sig" also accepts files saved with a byte-order mark, which
    # would otherwise end up glued to the first word.
    with open(path, encoding="utf-8-sig") as words_file:
        for line in words_file:
            for part in line.split("、"):
                word = part.strip()
                # Skip blanks: a trailing "、" used to produce an empty
                # word, and "" is a substring of every text.
                if word and word not in seen:
                    seen.add(word)
                    words.append(word)
    return words

def get_articles(clt):
    """Read the stored material list from MongoDB.

    :param clt: name of the collection inside ``mongo_db`` that holds the
                stored document
    :return: list with one entry per element of the document's "items";
             each entry is that element's ``["content"]["news_item"]``
             value.  Empty when the collection has no document.
    """
    # The lookup has to go through the collection named by *clt*; calling
    # find_one() on the database object itself does not run a query.
    collection = mongo_db[clt]
    doc = collection.find_one()
    if not doc:
        return []

    return [it["content"]["news_item"] for it in doc.get("items", [])]

def download(dir, file_name, url):
    """Download *url* to ``dir/file_name`` unless that file already exists.

    Best effort: a failed download is reported on stdout and never raised,
    so one bad URL does not abort a long run.

    :param dir: target directory, created when missing
    :param file_name: name of the file inside *dir*
    :param url: address to fetch; a falsy value is ignored
    :return: None
    """
    os.makedirs(dir, exist_ok=True)

    if not url:
        return

    # os.path.join instead of a hard-coded "\\" so this works off Windows.
    path = os.path.join(dir, file_name)

    # Check before fetching: no request is needed for a file we already have.
    if os.path.exists(path):
        return

    try:
        resp = requests.get(url, timeout=30)
        # Do not save an HTTP error page as if it were the image.
        resp.raise_for_status()

        with open(path, "wb") as f:
            f.write(resp.content)
    except Exception as err:
        # Exception rather than a bare except, so Ctrl-C still stops the run.
        print(url, err)

def find_images(content):
    """Return the image addresses referenced by an article's HTML.

    :param content: article HTML; it is percent-decoded before parsing
    :return: list of the ``data-src`` (or, failing that, ``src``) values of
             all ``<img>`` tags that have one
    """
    # NOTE(review): the whole document is percent-decoded, as before;
    # confirm the stored content really is percent-encoded.
    c = urllib.parse.unquote(content)
    imgs = []
    for img in BeautifulSoup(c, "html.parser").find_all("img"):
        src = img.get("data-src") or img.get("src")
        # Tags without any address used to add None to the result.
        if src:
            imgs.append(src)
    return imgs

def get_suffix(url):
    """Pick a file extension (with leading dot) for an image address.

    The format is taken from the ``wx_fmt`` query parameter when present,
    otherwise from the extension of the URL path.  A bare format name such
    as "png" is accepted too.  "jpeg", "other" and anything unusable map
    to ".jpg".

    :param url: image URL (or bare format name); may be None
    :return: extension such as ".png"; ".jpg" as the fallback
    """
    if not url or not isinstance(url, str):
        return ".jpg"

    try:
        parsed = urllib.parse.urlparse(url)
    except ValueError:
        return ".jpg"

    # Previously the whole URL was used as the suffix; extract the format.
    fmt = urllib.parse.parse_qs(parsed.query).get("wx_fmt", [""])[0]
    if not fmt:
        fmt = os.path.splitext(parsed.path)[1].lstrip(".")
    if not fmt and url.isalnum():
        # Bare format name, which the earlier implementation accepted.
        fmt = url

    fmt = fmt.strip().lower()
    if not fmt.isalnum() or fmt in ("jpeg", "other"):
        return ".jpg"
    return "." + fmt

def filter_content(content):
    """Return the plain text of an HTML fragment with all tags removed.

    :param content: HTML string
    :return: the text collected by ``ContentHtmlParser``
    """
    parser = ContentHtmlParser()
    parser.feed(content)
    # close() forces the parser to process data it is still buffering
    # (for example text after an unfinished "&" near the end of the input).
    parser.close()
    return parser.get_text()

def check_jinyongci(content, words=None):
    """Find the forbidden words that occur in an article.

    :param content: article HTML; tags are stripped before searching
    :param words: optional word list to check against; when omitted the
                  list is loaded with ``get_words()``.  Passing it in lets a
                  caller read the word file once for many articles.
    :return: list of the words found in the text, in word-list order
    """
    fc = filter_content(content)
    if words is None:
        words = get_words()
    # Empty words are skipped: "" is contained in every string.
    return [w for w in words if w and w in fc]

def save_jinyongci(clt, title, invalids):
    """Append one article's forbidden-word findings to ``clt/invalid.txt``.

    Nothing is written when *invalids* is empty.

    :param clt: directory that holds the report file; created when missing
    :param title: article title
    :param invalids: forbidden words found in the article
    :return: None
    """
    if not invalids:
        return

    os.makedirs(clt, exist_ok=True)
    report_path = os.path.join(clt, "invalid.txt")

    # newline="" writes the explicit "\r\n" verbatim; in default text mode
    # Windows would expand it to "\r\r\n".
    with open(report_path, "a", encoding="utf-8", newline="") as f:
        f.write("标题：" + title)
        f.write("\r\n敏感词：")
        # join() leaves no dangling separator after the last word.
        f.write("、".join(invalids))
        f.write("\r\n\r\n")

if __name__ == "__main__":
    # Name of both the MongoDB collection to read and the output directory.
    clt = "xxx"

    if not os.path.exists(clt):
        os.mkdir(clt)

    # Each entry from get_articles() may itself be a list of article dicts
    # (one "news_item" list per material), so flatten to one list of articles.
    articles = []
    for entry in get_articles(clt):
        if isinstance(entry, list):
            articles.extend(entry)
        else:
            articles.append(entry)
    print(clt + ": 共" + str(len(articles)) + "个")

    # Characters that are not allowed in Windows file names.
    unsafe_chars = str.maketrans("", "", '\\/:*?"<>|')

    for i, article in enumerate(articles):
        print("正在处理第 " + str(i) + " 个")

        title = article["title"]
        thumb_url = article.get("thumb_url")
        content = article["content"]

        # Download the cover image.
        fname = str(i) + "_" + title.translate(unsafe_chars)
        if thumb_url:
            download(clt, fname + get_suffix(thumb_url), thumb_url)

        # Download every image referenced inside the article.
        for index, img in enumerate(find_images(content)):
            if img:
                download(clt, fname + "_" + str(index) + get_suffix(img), img)

        # Report the forbidden words found in the article.
        invalids = check_jinyongci(content)
        print(invalids, '----', title)
        save_jinyongci(clt, title, invalids)

附带极限词列表

最大程度、最高级、最高端、最奢侈、最低级、最便宜、史上最低价、最流行、最受欢迎、最先进科学、最新技术、最新科学

中国第一、全网第一、销量第一、排名第一、第一品牌、NO.1、TOP1、独一无二、全国第一、最后一波、大品牌之一、销冠

国家级、国际级、世界级、千万级、百万级、星级、5A、甲级、超甲级

顶级、尖端、顶尖、顶级享受、完美、至尊、空前、绝后、绝版、非此莫属、巅峰、前所未有、完美、翘楚之作、不可再生、不可复制、绝无仅有、寸土寸金、淋漓尽致、无与伦比、唯一、卓越

前无古人后无来者、绝版、珍稀、臻稀、稀少、绝无仅有、绝不在有、稀世珍宝、千金难求、世所罕见、不可多得、空前绝后、寥寥无几、屈指可数

独家、独创、独据、开发者、缔造者、创始者、发明者

首个、首选、独家、首发、首席、首府、首选、首屈一指、全国首家、国家领导人、国门、国宅、首次、填补国内空白、国际品质

大牌、金牌、名牌、王牌、领先上市、巨星、著名、掌门人、至尊、冠军

世界领先、领先、领导者、领袖、引领、创领、领航、耀领

史无前例、前无古人、永久、万能、百分之百

Dlan 发表于 2019-7-30 17:58

狗汪汪发表于 2018-8-18 14:47
我确定是我。。。。
- -我还想问你是哪位。。。

大小蛋

Dlan 发表于 2018-8-21 09:45

狗汪汪发表于 2018-8-18 14:47
我确定是我。。。。
- -我还想问你是哪位。。。

那我说个谜语，你对了就是你，大小打一字。

你好吃干脆面吗 发表于 2018-6-8 19:47

{:1_921:}很强大、支持了。科技改变生活

mirc 发表于 2018-6-8 20:07

怎么使用~~~~~~~~~~~~

zgydsy 发表于 2018-6-8 20:11

感谢分享

Miii 发表于 2018-6-8 20:14

厉害厉害了！支持支持……

byebye1000 发表于 2018-6-8 20:21

我回贴，先顶再看。顶住

Slopr 发表于 2018-6-8 20:29

学习学习

狗汪汪 发表于 2018-8-16 17:19

哥。。。。你能把头像换了么。。。
那张脸是我。。。

Dlan 发表于 2018-8-17 09:47

狗汪汪发表于 2018-8-16 17:19
哥。。。。你能把头像换了么。。。
那张脸是我。。。

你确定是你啊

rickyaimar 发表于 2018-8-17 13:56

极限词nb，小白，还需要找到API。。。{:17_1062:}

页: [1] 2

吾爱破解 - 52pojie.cn's Archiver

用Python实现微信公众号API素材库图文消息抓取