# [Python] View as plain text / Copy code  (forum paste header; not part of the script)
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from wordpress_xmlrpc import Client, WordPressPost, WordPressTerm
from wordpress_xmlrpc.methods.posts import NewPost
from wordpress_xmlrpc.methods.media import UploadFile
from wordpress_xmlrpc.methods.taxonomies import NewTerm, GetTerms
from wordpress_xmlrpc.exceptions import InvalidCredentialsError
# NHK news source settings.
# `host` is the prefix joined with each article's relative link and image path;
# `url` is the JSON feed listing the newest articles.
host = 'https://www3.nhk.or.jp/news/'
url = "https://www3.nhk.or.jp/news/json16/new_001.json"
# Alternative per-category feeds, kept for reference:
# url1 = "https://www3.nhk.or.jp/news/json16/cat04_001.json" # politics
# url2 = "https://www3.nhk.or.jp/news/json16/cat01_001.json" # society
# url3 = "https://www3.nhk.or.jp/news/json16/cat06_001.json" # international
# url4 = "https://www3.nhk.or.jp/news/json16/cat08_001.json" # weather
# url5 = "https://www3.nhk.or.jp/news/json16/cat07_001.json" # sports
# url6 = "https://www3.nhk.or.jp/news/json16/cat05_001.json" # business
headers = {
    'user-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
}
# Without a timeout, requests waits indefinitely on a stalled connection and
# the whole script hangs; 30 seconds bounds the wait.
response = requests.get(url, headers=headers, timeout=30)
if response.status_code == 200:
    json_content = response.json()
    print(json_content)
else:
    print("Failed to retrieve JSON content")
# WordPress connection settings.
# `xmlrpc` is the endpoint file name appended to each entry of `host_url_arr`.
# NOTE(review): the credentials and site URL below are hard-coded placeholders;
# consider loading real values from the environment rather than committing them.
xmlrpc = 'xmlrpc.php'
username, password = 'admin', '123456'
host_url_arr = ['https://abc.com/']

# One row per feed category code: (code, Japanese display name, WordPress slug).
_CATEGORY_TABLE = (
    ("1", "社会", "society"),
    ("2", "暮らし", "life"),
    ("3", "科学・文化", "culture"),
    ("4", "政治", "politics"),
    ("5", "ビジネス", "business"),
    ("6", "国際", "international"),
    ("7", "スポーツ", "pe"),
    ("8", "天気", "weather"),
)
# Category code -> display name used as the WordPress category name.
category_arr = {code: label for code, label, _slug in _CATEGORY_TABLE}
# Category code -> slug used when the WordPress category is created.
category_slug = {code: slug for code, _label, slug in _CATEGORY_TABLE}
# Seconds to wait for any single HTTP request before giving up.
_REQUEST_TIMEOUT = 30
# Layout used to parse the feed's "pubDate" strings.
_PUB_DATE_FORMAT = '%a, %d %b %Y %H:%M:%S %z'


def _ensure_category(client, cate):
    """Create the WordPress category for feed code *cate* if a search finds none.

    Returns the category's display name (the value from ``category_arr``).
    """
    category_name = category_arr[cate]
    terms = client.call(GetTerms('category', {'search': category_name}))
    if terms:
        print(f"{category_name} exists.")
    else:
        category = WordPressTerm()
        category.name = category_name
        category.taxonomy = 'category'
        category.slug = category_slug[cate]
        client.call(NewTerm(category))
    return category_name


def _fetch_article_text(article_url):
    """Download an article page and return the text of its body paragraphs.

    Each ``<p>`` inside ``div.content--detail-body`` contributes one
    newline-terminated line. Returns None when that div is absent.
    """
    page = requests.get(article_url, timeout=_REQUEST_TIMEOUT)
    # The raw bytes are parsed, so the intended UTF-8 decoding has to be
    # requested from the parser; setting ``response.encoding`` has no effect here.
    soup = BeautifulSoup(page.content, "html.parser", from_encoding="utf-8")
    body = soup.find("div", {"class": "content--detail-body"})
    if body is None:
        return None
    return "".join(p.text.strip() + "\n" for p in body.find_all("p"))


def _upload_thumbnail(client, image_url):
    """Download *image_url* and upload it to the WordPress media library.

    Returns the new attachment id, or None when the download or upload failed.
    """
    image = requests.get(image_url, timeout=_REQUEST_TIMEOUT)
    if not image.ok:
        # Do not upload an HTTP error page as if it were the picture.
        print("文件上传失败。")
        return None
    uploaded = client.call(UploadFile({
        "name": "image.jpg",        # file name
        "type": "image/jpeg",       # MIME type
        "bits": image.content,      # raw file data
    }))
    if not uploaded:
        print("文件上传失败。")
        return None
    print(f"文件已成功上传,URL:{uploaded['url']}")
    return uploaded['attachment_id']


def _publish_article(client, article):
    """Publish one feed *article* through *client*.

    Returns the new post id, or None when the article was skipped (unknown
    category code or no article body on the page). Exceptions raised by the
    network or XML-RPC calls propagate to the caller.
    """
    title = article["title"]
    link = article["link"]
    img_path = article['imgPath']
    cate = article['cate']  # category code
    if cate not in category_arr:
        # A code missing from the tables must not abort the whole run.
        print(f"Skipping '{title}': unknown category code {cate!r}")
        return None

    category_name = _ensure_category(client, cate)

    content = _fetch_article_text(host + link)
    if content is None:
        return None

    thumbnail_id = None
    if img_path:
        thumbnail_id = _upload_thumbnail(client, host + img_path)

    pub_date = article["pubDate"]
    published_at = datetime.strptime(pub_date, _PUB_DATE_FORMAT)
    # python-wordpress-xmlrpc marshals `date` as an XML-RPC DateTime, so it
    # needs a datetime object; a pre-formatted "YYYY-MM-DDTHH:MM:SS" string
    # would be sent verbatim, which is not the XML-RPC wire format.
    # NOTE(review): the library appears to map `date` to WordPress's
    # post_date_gmt, hence the conversion to naive UTC - confirm against the
    # installed library version.
    published_utc = (published_at - published_at.utcoffset()).replace(tzinfo=None)

    print(title)
    print(content)
    print(pub_date)
    print(link)

    wp_post = WordPressPost()
    wp_post.title = title
    wp_post.content = content
    wp_post.date = published_utc
    wp_post.slug = article['id']
    wp_post.post_status = "publish"
    wp_post.terms_names = {"category": [category_name]}
    if thumbnail_id is not None:
        # Only reference a featured image that was actually uploaded.
        wp_post.thumbnail = thumbnail_id
    return client.call(NewPost(wp_post))


if response.status_code == 200:
    json_content = response.json()
    i = 0  # index into host_url_arr of the site that receives the next article
    for article in json_content["channel"]["item"]:
        title = article["title"]
        # The try covers client creation and every remote call: the first
        # authenticated call happens long before NewPost, so a narrower try
        # would never reach the credentials handler.
        try:
            client = Client(host_url_arr[i] + xmlrpc, username, password)
            post_id = _publish_article(client, article)
        except InvalidCredentialsError:
            print("WordPress用户名或密码错误")
        except Exception as e:
            print(f"发布文章'{title}'失败,错误信息:{e}")
        else:
            if post_id is not None:
                # Rotate to the next site after a successful post, wrapping on
                # the real list length so the index can never run past the end.
                i = (i + 1) % len(host_url_arr)
                print(f"文章'{title}'发布成功,ID为{post_id}")
else:
    print("Failed to retrieve JSON content")