代码修改自本站duskdust的一个简单的针对扫描版pdf压缩的软件 ,原帖地址https://www.52pojie.cn/forum.php?mod=viewthread&tid=1904973&highlight=pdf%D1%B9%CB%F5。
全部用kimi写的,自己没学过Python{:1_907:},添加了多文件多进程同时处理功能,还可以设置图片压缩质量。直接用Python运行没问题,但是用 pyinstaller cpdf.py --onefile 编译成exe之后,程序会无限循环地要求选择文件序号和功能。
# 导入必要的库
import os
import concurrent.futures
from pypdf import PdfReader, PdfWriter
from tqdm import tqdm
from PIL import Image, ImageEnhance
from io import BytesIO
# 将图片转换为黑白,并增强对比度、亮度、锐度和色彩
def blacky(im):
    """Binarize an image and re-encode it as an in-memory Group 4 TIFF.

    The image is converted to greyscale, run through the contrast (3),
    brightness (1.5), sharpness (2) and colour (1.5) enhancers in that
    order, thresholded at 128 into a 1-bit image, and saved to a
    ``BytesIO`` buffer as TIFF.

    Args:
        im: A PIL image.

    Returns:
        The PIL image re-opened from the in-memory TIFF buffer.
    """
    grey = im.convert('L')

    # Enhancers are applied strictly in this order; each one consumes the
    # output of the previous one.
    enhancement_steps = (
        (ImageEnhance.Contrast, 3),
        (ImageEnhance.Brightness, 1.5),
        (ImageEnhance.Sharpness, 2),
        (ImageEnhance.Color, 1.5),
    )
    for enhancer_cls, factor in enhancement_steps:
        grey = enhancer_cls(grey).enhance(factor)

    # Lookup table: grey levels below the cutoff map to 0, the rest to 1.
    cutoff = 128
    lookup = [0] * cutoff + [1] * (256 - cutoff)
    bilevel = grey.point(lookup, '1')

    # Encode into memory and hand back an image backed by that buffer.
    tiff_buffer = BytesIO()
    bilevel.save(tiff_buffer, format="TIFF", compression='group4', optimize=True, dpi=[300, 300])
    return Image.open(tiff_buffer)
# 根据间接引用获取PDF页面编号
def get_page_number_from_indirect(reader, indirect_reference):
    """Find the index of the page that carries a given indirect reference.

    Args:
        reader: Object exposing an iterable ``pages`` attribute whose items
            have an ``indirect_reference`` attribute.
        indirect_reference: The reference to look for.

    Returns:
        The zero-based index of the first page whose ``indirect_reference``
        compares equal to ``indirect_reference``, or ``None`` if no page
        matches.
    """
    matching_positions = (
        position
        for position, candidate in enumerate(reader.pages)
        if candidate.indirect_reference == indirect_reference
    )
    return next(matching_positions, None)
# 将书签添加到PDF写入器
def add_bookmarks_to_writer(writer, reader, outlines, parent=None):
    """Copy an outline (bookmark) tree into a writer, keeping its nesting.

    ``outlines`` is walked in order. A plain entry is added to ``writer``
    under ``parent``. A nested list is treated as the children of the entry
    that immediately precedes it and is added recursively under that entry,
    so the hierarchy of the source outline is reproduced instead of being
    flattened onto a single level.

    Args:
        writer: Target object providing
            ``add_outline_item(title, page_number, parent=...)``.
        reader: Source reader, used to resolve each entry's page reference
            to a page index.
        outlines: Sequence mixing outline entries (objects with ``.get``
            giving ``'/Title'`` and ``'/Page'``) and nested lists.
        parent: Outline item under which the entries of this level are
            added; ``None`` adds them at the top level.
    """
    # Most recently added entry on this level; a nested list that follows
    # it holds its children.
    last_added = None
    for item in outlines:
        if isinstance(item, list):
            # A list with no preceding entry on this level has no owner of
            # its own, so it stays under the current parent.
            owner = last_added if last_added is not None else parent
            add_bookmarks_to_writer(writer, reader, item, parent=owner)
        else:
            title = item.get('/Title')
            indirect_reference = item.get('/Page')
            # Resolve the page reference to a page index (None if the page
            # is not found in the reader).
            page_num = get_page_number_from_indirect(reader, indirect_reference)
            last_added = writer.add_outline_item(title, page_num, parent=parent)
# 处理单个PDF文件的函数
def process_pdf(file_index, choice, pdf_file, quality=None):
    """Rewrite one PDF using the selected size-reduction operation.

    All pages are copied into a new writer, the operation selected by
    ``choice`` is applied, the source outline (if any) is copied, and the
    result is written to ``reduced_<name>.pdf``.

    Args:
        file_index: Index of the file in the caller's listing; not used by
            this function.
        choice: Operation number: 1 copies the metadata, 2 removes images,
            3 re-encodes images at ``quality``, 4 compresses the page
            content streams, 5 replaces images with their ``blacky``
            version. Any other value applies no operation.
        pdf_file: Path of the PDF to read.
        quality: Image quality passed to the image replacement; required
            when ``choice`` is 3.

    Returns:
        None. Every exception raised while processing is caught and
        reported on standard output.
    """
    try:
        source = PdfReader(pdf_file)
        target = PdfWriter()

        # Copy every page first; the operations below work on the writer.
        for src_page in tqdm(source.pages, desc=f"Processing {pdf_file}"):
            target.add_page(src_page)

        if choice == 1:
            info = source.metadata
            if info is None:
                print("没有元数据可以添加。")
            else:
                target.add_metadata(info)
        elif choice == 2:
            target.remove_images()
        elif choice == 3:
            if quality is None:
                raise ValueError("图片质量值未提供")
            for out_page in tqdm(target.pages, desc=f"Compressing images {pdf_file}"):
                for picture in out_page.images:
                    picture.replace(picture.image, quality=quality)
        elif choice == 4:
            for out_page in tqdm(target.pages, desc=f"Applying lossless compression {pdf_file}"):
                out_page.compress_content_streams()
        elif choice == 5:
            for out_page in tqdm(target.pages, desc=f"Binarizing images {pdf_file}"):
                for picture in out_page.images:
                    picture.replace(blacky(picture.image))

        # Carry the bookmarks over when the source has any.
        bookmarks = source.outline
        if bookmarks:
            add_bookmarks_to_writer(target, source, bookmarks)

        stem = os.path.splitext(pdf_file)[0]
        output_file = f"reduced_{stem}.pdf"
        with open(output_file, "wb") as out_handle:
            target.write(out_handle)
        print(f"Processed file saved as {output_file}")
    except Exception as err:
        print(f"处理文件 {pdf_file} 时发生错误:{err}")
# 主函数
def main():
    """Interactively select PDFs in the current directory and process them.

    Lists every ``*.pdf`` file in the current directory with its index,
    asks for the indices to process, the operation and (for operation 3)
    the image quality, then runs ``process_pdf`` for each selected file on
    a process pool.

    Returns:
        None. Returns early when the directory holds no PDF file or when
        nothing usable was selected.
    """
    pdf_files = [f for f in os.listdir('.') if f.endswith('.pdf')]
    if not pdf_files:
        print("当前目录下没有找到PDF文件。")
        return

    # Show the indices before asking for them, from the very list the
    # answers are resolved against, so the numbers on screen always match.
    for idx, file in enumerate(pdf_files):
        print(f"{idx}: {file}")

    indices = input_indices(pdf_files)
    choice = input_choice()
    quality = input_quality(choice)
    if not indices or choice is None or (choice == 3 and quality is None):
        return

    # No point in starting more workers than there are files.
    worker_count = max(1, min(os.cpu_count() or 1, len(indices)))
    if os.name == "nt":
        # ProcessPoolExecutor rejects max_workers above 61 on Windows.
        worker_count = min(worker_count, 61)

    with concurrent.futures.ProcessPoolExecutor(max_workers=worker_count) as executor:
        # Remember which file each future belongs to so a failure can be
        # reported against the right name.
        future_to_file = {
            executor.submit(
                process_pdf, idx, choice, pdf_files[idx], quality if choice == 3 else None
            ): pdf_files[idx]
            for idx in indices
        }
        for future in concurrent.futures.as_completed(future_to_file):
            try:
                future.result()
            except Exception as exc:
                print(f"处理文件 {future_to_file[future]} 时发生错误:{exc}")
# 从用户获取要处理的PDF文件索引
def input_indices(pdf_files):
    """Prompt until the user selects a valid set of PDF indices.

    The answer is either ``all`` (case-insensitive) or a comma-separated
    list of integer indices into ``pdf_files``. Invalid answers print a
    message and the prompt is repeated.

    Args:
        pdf_files: The list of candidate file names; only its length is
            used, to validate the indices.

    Returns:
        A list of distinct indices in the order they were first entered;
        ``all`` yields every index in ascending order.
    """
    while True:
        selection = input("输入'all'以处理所有PDF文件,或者输入用逗号分隔的文件索引:").strip().lower()
        if selection == 'all':
            return list(range(len(pdf_files)))

        try:
            indices = [int(token) for token in selection.split(',')]
        except ValueError:
            print("输入无效。请输入'all'或者用逗号分隔的索引。")
            continue

        if all(0 <= idx < len(pdf_files) for idx in indices):
            # Drop repeated indices: submitting the same file twice would
            # make two workers write the same output file at once.
            return list(dict.fromkeys(indices))
        print("所有索引必须在0到{}之间。".format(len(pdf_files) - 1))
# 从用户获取压缩PDF的方式选择
def input_choice():
    """Show the operation menu and prompt until a number from 1 to 5 is given.

    Returns:
        The selected operation number as an ``int`` in the range 1-5.
    """
    menu_lines = (
        "选择压缩PDF的方式",
        "1: 删除重复对象",
        "2: 删除图像",
        "3: 降低图片质量",
        "4: 使用无损压缩",
        "5: 二值化压缩为tif",
    )
    while True:
        # The menu is printed again before every attempt.
        for line in menu_lines:
            print(line)
        answer = input("输入选择 (1-5): ")
        try:
            selected = int(answer)
        except ValueError:
            selected = None
        if selected is not None and 1 <= selected <= 5:
            return selected
        print("输入无效。请输入1到5之间的数字。")
# 从用户获取图片质量值
def input_quality(choice):
    """Ask for the image quality when the chosen operation needs one.

    Args:
        choice: The selected operation number; only operation 3 prompts.

    Returns:
        An ``int`` from 1 to 100 when ``choice`` is 3 (the prompt repeats
        until a valid value is entered), otherwise ``None`` without
        prompting.
    """
    if choice != 3:
        return None
    while True:
        answer = input("请输入图片质量(1-100):").strip()
        # isdecimal() only accepts characters int() can parse; isdigit()
        # also accepts ones like a superscript two, on which int() raises.
        if answer.isdecimal() and 1 <= int(answer) <= 100:
            return int(answer)
        print("输入错误,请重新输入图片质量(1-100)。")
# 程序入口
if __name__ == "__main__":
    # Imported here because it is needed only when run as a script.
    import multiprocessing

    # Must be called first in a frozen Windows executable (e.g. one built
    # with PyInstaller): without it each process-pool worker starts the
    # program from the top and asks for input again.
    multiprocessing.freeze_support()
    main()
|