python:湖南日报pdf下载合并代码
输入8位数字的日期,可以下载各个版面的pdf,最后合成一个pdf文件。
import os
import shutil
import requests
from PyPDF2 import PdfMerger
# Function: download a PDF file from a URL.
def download_pdf_from_url(url, filename, i):
    """Stream the PDF at ``url`` into the local file ``filename``.

    On an HTTP 200 response the page index ``i`` is printed (followed by a
    space) as progress output and the body is written to ``filename``. Any
    other status code prints a single ``.`` and no file is created.

    Args:
        url: Address of the PDF to fetch.
        filename: Local path the downloaded PDF is written to.
        i: Page number; used only for the progress output.

    Raises:
        requests.RequestException: On connection errors or timeouts.
        OSError: If ``filename`` cannot be opened or written.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110'
    }
    # A timeout keeps a stalled server from hanging the whole run, and the
    # context manager releases the streamed connection even on errors.
    with requests.get(url, stream=True, headers=headers, timeout=30) as response:
        if response.status_code != 200:
            # Missing page (or other HTTP error): just mark it in the output.
            print('.', end='')
            return
        print(i, end=' ')
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(1024):
                if chunk:  # write non-empty chunks only
                    f.write(chunk)
# Function: merge the PDF files found in a folder.
def merge_pdfs_in_folder(folder_path, output_filename):
    """Merge every ``*.pdf`` file in ``folder_path`` into ``output_filename``.

    Files are appended in sorted filename order, then the combined document
    is written to ``output_filename`` and a confirmation line is printed.

    Args:
        folder_path: Directory containing the PDF files to merge.
        output_filename: Path of the merged PDF to create.

    Raises:
        OSError: If the folder cannot be listed or the output cannot be
            written.
    """
    # os.listdir() returns names in arbitrary order; sort them so pages are
    # merged in a deterministic, name-based sequence.
    pdf_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.pdf')
    )
    merger = PdfMerger()
    try:
        for name in pdf_names:
            merger.append(os.path.join(folder_path, name))
        with open(output_filename, 'wb') as outfile:
            merger.write(outfile)
    finally:
        # Release the merger's resources even if appending or writing fails.
        merger.close()
    print(f"Merged PDF saved as {output_filename}")
# Main program.
def main():
    """Download every page PDF for a user-supplied date and merge them.

    Prompts for an 8-digit date (``YYYYMMDD``), tries page numbers 01-98,
    saves each page that exists into a temporary folder, merges the pages
    into ``d:/hnrb/hnrb_<date>.pdf`` and finally removes the temporary
    folder. Returns early with a message if the input is not 8 digits.
    """
    # Ask the user for the date.
    date_input = input('请输入形如20240417这样的8位数日期:')
    if not date_input.isdigit() or len(date_input) != 8:
        print("输入的日期格式不正确,请确保是8位数的数字!")
        return

    # Create the download folder.
    pdf_folder = 'd:/hnrb/temp'
    os.makedirs(pdf_folder, exist_ok=True)

    # Split the date into the path components used by the URL.
    # NOTE(review): assumes the server lays editions out as YYYY-MM/DD
    # (e.g. 2024-04/17); confirm against the live site.
    year_month = f'{date_input[:4]}-{date_input[4:6]}'
    day = date_input[6:8]

    try:
        # Download the PDF of each page; pages that do not exist are skipped
        # by the downloader, failures are reported and the loop continues.
        for i in range(1, 99):
            ban = f'{i:02d}'  # zero-padded two-digit page number
            pdf_url = f'https://hnrb.voc.com.cn/hnrb_epaper/images/{year_month}/{day}/{ban}/{date_input}{ban}_pdf.pdf'
            pdf_filename = os.path.join(pdf_folder, f'pdf_{date_input}{ban}.pdf')
            try:
                download_pdf_from_url(pdf_url, pdf_filename, i)
            except Exception as e:
                print(f"Failed to download {pdf_filename}: {e}")
        print()

        # Merge the downloaded pages into a single PDF.
        output_pdf_filename = f'd:/hnrb/hnrb_{date_input}.pdf'
        merge_pdfs_in_folder(pdf_folder, output_pdf_filename)
    finally:
        # Remove the download folder even if merging fails, so stale pages
        # are not merged into a later run.
        shutil.rmtree(pdf_folder)
    print('任务完成!')
# Run the main program only when this file is executed as a script.
if __name__ == '__main__':
    main()
开始学习python 已经使用上了,需要PyPDF2,很棒,是否可以加一个下载进度? 感谢分享,开始学习Python. 可以的666 学习一下 网站地址发一个 yysyWang 发表于 2024-4-17 16:46
网站地址发一个
https://epaper.voc.com.cn/hnrb/html/2024-04/17/node_201.htm
页:
[1]