import requests
from bs4 import BeautifulSoup
import os
import re
import base64
import logging

html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{title}</title>
<style>{style}
</style>
</head>
<body class='container'>
{content}
</body>
</html>
"""

style = """
body {
    font-family: Arial, sans-serif;
}
.container {
    width: 700px;
    margin: 0 auto; /* center the content column */
    margin-top: 30px;
}
.container img { /* center images horizontally */
    display: block; /* treat images as block-level so margin applies */
    margin: 0 auto; /* horizontal centering */
    max-width: 60%; /* keep images within the container width */
    height: auto; /* preserve the original aspect ratio */
}
.container h1 {
    font-size: 36px;
    margin: 0 auto; /* center */
}
"""


def setup_logger():
    # Basic logging configuration: timestamp, level, and message
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


class WebScraper:
    def __init__(self, start_id=0):
        self.start_id = start_id
        self.last_id_path = "last_id.txt"

    def load_last_id(self):
        """Load the last processed article ID, falling back to start_id."""
        if os.path.exists(self.last_id_path):
            with open(self.last_id_path, 'r') as file:
                return int(file.read().strip())
        return self.start_id

    def save_last_id(self, last_id):
        # Persist the last processed ID so the next run can resume from it
        with open(self.last_id_path, "w") as file:
            file.write(str(last_id))

    @staticmethod
    def make_request(url):
        # Send an HTTP request with basic error handling and a timeout
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()  # raise on 4xx/5xx status codes
            return response
        except requests.RequestException as e:
            logging.error(f"Request failed: {e}")
            return None

    @staticmethod
    def sanitize_filename(file_name):
        # Strip characters that are not legal in file names
        return re.sub(r'[^\w\s-]', '', file_name).strip().replace(' ', '_')

    def parse_webpage(self, response):
        # Parse the page and extract the title, article body, and publication date
        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.find('h1')
        title = title_tag.text.strip() if title_tag else ""
        article_body = soup.find(class_='article-body')
        pub_date_tag = soup.find(class_='time')
        pub_date = pub_date_tag.text.strip().replace("-", "") if pub_date_tag else ""
        return title, article_body, pub_date

    def download_and_encode_image(self, img_url):
        # Download an image and return it as a Base64 data URI so the
        # saved HTML renders offline (a bare Base64 string would not display)
        response = self.make_request(img_url)
        if response:
            content_type = response.headers.get('Content-Type', 'image/jpeg')
            encoded = base64.b64encode(response.content).decode('utf-8')
            return f"data:{content_type};base64,{encoded}"
        return None

    def embed_images_in_html(self, soup, images_base64):
        # Replace remote image URLs with their embedded Base64 data URIs
        for img in soup.find_all('img'):
            img_src = img.get('src')
            if img_src and img_src.startswith(('http:', 'https:')):
                base64_img = images_base64.get(img_src)
                if base64_img:
                    img['src'] = base64_img

    def scrape_and_process(self, url):
        # Full pipeline for one article: fetch, parse, embed images, save
        response = self.make_request(url)
        if not response:
            return
        title, article_body, pub_date = self.parse_webpage(response)
        if not title or article_body is None:
            logging.warning(f'No article content found, skipping: {url}')
            return
        title_sanitized = self.sanitize_filename(title)
        images_base64 = {img.get('src'): self.download_and_encode_image(img.get('src'))
                         for img in article_body.find_all('img')
                         if img.get('src') and img.get('src').startswith(('http:', 'https:'))}
        self.embed_images_in_html(article_body, images_base64)
        html_body = html_template.format(title=title, content=str(article_body), style=style)
        author = '安全内参'
        file_name = f"[{author}] - {pub_date} - {title_sanitized}.html"
        year, month, day = pub_date[:4], pub_date[4:6], pub_date[6:8]
        folder_to_save = "D:\\微信文件"
        target_dir = os.path.join(folder_to_save, author, year, month, day)
        os.makedirs(target_dir, exist_ok=True)  # exist_ok avoids a separate existence check
        f_name = os.path.join(target_dir, file_name)
        try:
            with open(f_name, 'w', encoding='utf-8') as file:
                file.write(html_body)
            logging.info(f'Article saved to: {f_name}')
        except IOError as e:
            logging.error(f'Error saving file: {e}')

    def run(self):
        # Main loop: walk the article ID range, resuming from the last saved ID
        last_id = self.load_last_id()
        for i in range(last_id, 90000):
            url = f'https://www.secrss.com/articles/{i}'
            logging.info(f"Processing: {url}")
            self.scrape_and_process(url)
            self.save_last_id(i + 1)


if __name__ == '__main__':
    setup_logger()
    scraper = WebScraper()
    scraper.run()
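
If the site rate-limits bursts of requests, pausing between iterations is a simple mitigation. A minimal sketch using the WebScraper class above; the 1-second delay is an arbitrary example, not part of the original script:

import time

# Sketch: throttled version of the main loop (the delay value is an assumption)
if __name__ == '__main__':
    setup_logger()
    scraper = WebScraper()
    for i in range(scraper.load_last_id(), 90000):
        scraper.scrape_and_process(f'https://www.secrss.com/articles/{i}')
        scraper.save_last_id(i + 1)
        time.sleep(1)  # pause between requests to stay polite to the server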