# -*- coding: utf-8 -*-
import os
import random
from datetime import date
import time
from urllib.parse import urljoin
import re
import requests
import shutil
import fitz
from bs4 import BeautifulSoup

def datestr2ymdw(s: str):
    """
    Convert a date string such as "20240318" into "2024年3月18日 星期一".

    Args:
        s (str): date string in YYYYMMDD format, e.g. "20240318"
    """
    year = int(s[:4])
    month = int(s[4:6])
    day = int(s[6:8])
    week = date(year, month, day).weekday()
    week_list = ["星期一", "星期二", "星期三", "星期四", "星期五", "星期六", "星期日"]
    return f"{year}年{month}月{day}日 {week_list[week]}"

def merge_pdfs(file_list, out_filename):
    """
    Merge PDF files with PyMuPDF (fitz).
    Reference: https://blog.csdn.net/winniezhang/article/details/132333475
    """
    pdf_merger = fitz.open()
    for filename in file_list:
        try:
            pdf = fitz.open(filename)
            pdf_merger.insert_pdf(pdf)
            pdf.close()
        except Exception as e:
            print(e)
            continue
    try:
        pdf_merger.save(out_filename)
    except Exception as e:
        print(e)
    pdf_merger.close()
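
# Usage sketch (hypothetical file names, not from the original post):
# merge_pdfs(["node_01.pdf", "node_02.pdf"], "merged.pdf")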

# Jilin Daily (吉林日报) e-paper downloader
class Jlrb(object):
    def __init__(self, epaper_date: str = None):
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 SE 2.X MetaSr 1.0"
        }
        self.headers = headers
        # Hyperlinks of the e-paper layout pages (HTML)
        html_urls = []
        if epaper_date is None:
            url = "http://jlrbszb.dajilin.com/pc/paper/layout/index.html"
            response = requests.get(url, headers=headers)
            response.encoding = "UTF-8"
            soup = BeautifulSoup(response.text, features="lxml")
            # Restrict parsing to the layout list node
            soup = soup.find(name="ul", attrs={"id": "list"})
            allnode_of_a = soup.find_all("a")
            result = [_.get("href") for _ in allnode_of_a]
            # Derive epaper_date from the first layout link
            urlptn = r".*(\d{6})/(\d{2}).*\.html"
            m = re.match(urlptn, result[0])
            epaper_date = m.group(1) + m.group(2)
            for ul in result:
                html_urls.append(urljoin(url, ul))
        else:
            url = "http://jlrbszb.dajilin.com/pc/paper/layout/" + epaper_date[:6] + "/" + epaper_date[6:8] + "/node_01.html"
            response = requests.get(url, headers=headers)
            response.encoding = "UTF-8"
            soup = BeautifulSoup(response.text, features="lxml")
            # Restrict parsing to the layout list node
            soup = soup.find(name="ul", attrs={"id": "layoutlist"})
            allnode_of_a = soup.find_all("a")
            result = [_.get("href") for _ in allnode_of_a]
            for ul in result:
                html_urls.append(urljoin(url, ul))
        self.pdf_urls = []
        for url_temp in html_urls:
            response = requests.get(url_temp, headers=headers)
            response.encoding = "UTF-8"
            soup = BeautifulSoup(response.text, features="lxml")
            # The PDF link of each layout page sits in the <p id="pdfUrl"> node
            soup = soup.find(name="p", attrs={"id": "pdfUrl"})
            self.pdf_urls.append(urljoin(url_temp, soup.text))
        # File name of the merged PDF
        self.pdf_name = "吉林日报 " + datestr2ymdw(epaper_date)
        print(f"[{self.pdf_name}] 共 {len(self.pdf_urls)} 个PDF文件需要合并")

    def save_pdf(self, download_path, pdf_href, pdf_detail_name):
        resp_download_pdf = requests.get(pdf_href, headers=self.headers)
        # Create the temp folder if it does not exist
        path = f"{download_path}/temp_file"
        if not os.path.exists(path):
            os.mkdir(path)
        with open(f"{download_path}/temp_file/{pdf_detail_name}", mode="wb") as f:
            f.write(resp_download_pdf.content)
        print(f"{pdf_detail_name} 下载完成")
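
    # (Sketch, not from the original post) save_pdf could be hardened with a request
    # timeout and streamed writes, e.g.:
    #   resp = requests.get(pdf_href, headers=self.headers, timeout=30, stream=True)
    #   resp.raise_for_status()
    #   with open(f"{download_path}/temp_file/{pdf_detail_name}", mode="wb") as f:
    #       for chunk in resp.iter_content(chunk_size=8192):
    #           f.write(chunk)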

    def download_single_pdf_file(self, download_path):
        self.pdf_files = []
        for url in self.pdf_urls:
            num = random.randint(1, 3)
            print(f"{url}, 随机暂停时间:{num}秒")
            pdf_detail_name = os.path.basename(url)
            self.save_pdf(download_path, url, pdf_detail_name)
            self.pdf_files.append(f"{download_path}/temp_file/" + pdf_detail_name)
            time.sleep(num)

    def download(self):
        save_path = "E:/downloads/epaper/吉林日报"
        self.download_single_pdf_file(save_path)
        # self.pdf_files already holds the full paths of the downloaded page PDFs
        file_list = self.pdf_files
        out_filename = f"{save_path}/{self.pdf_name}.pdf"
        merge_pdfs(file_list, out_filename)
        if os.path.exists(f"{save_path}/temp_file"):
            shutil.rmtree(f"{save_path}/temp_file")
        print(f"下载已完成:{save_path}")
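
# (Design note, not from the original post) download() hard-codes the save path; a
# hypothetical variant could take it as a parameter instead, e.g.
#   def download(self, save_path="E:/downloads/epaper/吉林日报"): ...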

def download_jlrb(epaper_date: str = None):
    """
    Download the Jilin Daily (吉林日报) e-paper.

    Args:
        epaper_date (str): date in YYYYMMDD format, e.g. "20240318";
            defaults to the current day's issue when omitted.
    """
    epaper = Jlrb(epaper_date)
    epaper.download()


# Download the current day's issue
download_jlrb()
# Download a specific date
download_jlrb("20240311")
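
# ---- Optional usage sketch (not part of the original post) ----
# Assuming the code above is in the same file, this hypothetical helper fetches the
# most recent issues in one run; the 3-day window is an arbitrary example.
from datetime import timedelta


def download_recent(days: int = 3):
    for offset in range(days):
        day = date.today() - timedelta(days=offset)
        download_jlrb(day.strftime("%Y%m%d"))

# download_recent(3)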