背景
临时需要几本人教版教材,就跑去人民教育出版社查看。
但是离线环境下无法在线阅读,于是就想把教材下载下来。
网站没做反爬,直接爬取就行。所以这只是段随手写的几行小脚本,方便自用而已,大佬勿喷。
另:其他版本教材可到国家中小学智慧教育平台获取。在对应教材的阅读页打开"开发者工具 - 网络",等页面异步加载完成后,下载其中的 pdf.pdf 文件即可。
教材版权归出版社所有,在此仅作代码交流用途。
地址
- 仓库:Textbook Crawler
代码
import os
import requests
from PIL import Image
class Crawler:
    """Download textbook page scans and bundle each book into a PDF.

    ``textbooks`` maps a book title to a dict with two integer entries:
    ``'id'`` (the book's path segment on book.pep.com.cn) and ``'pages'``
    (how many page images to fetch). The title is also used as the name of
    the download directory and as the stem of the generated PDF.

    Author: Senkita
    """

    textbooks: dict[str, dict[str, int]]

    def __init__(self, textbooks: dict[str, dict[str, int]]) -> None:
        """Store the mapping of book titles to their id / page count."""
        self.textbooks = textbooks

    def download_pic(self, book: str) -> None:
        """Download every page image of ``book`` into a directory named ``book``.

        Pages are numbered from 1 to ``pages`` and saved as ``<book>/<n>.jpg``.

        Args:
            book: Title of the book; must be a key of ``self.textbooks``.

        Raises:
            KeyError: If ``book`` is not in ``self.textbooks``.
            requests.RequestException: If a page cannot be fetched (network
                error, timeout, or a non-2xx HTTP status).
        """
        os.makedirs(book, exist_ok=True)
        book_id: int = self.textbooks[book]['id']
        pages: int = self.textbooks[book]['pages']
        # One session for the whole book so the connection is reused
        # instead of being re-established for every page.
        with requests.Session() as session:
            for page_num in range(1, pages + 1):
                url: str = 'https://book.pep.com.cn/{}/files/mobile/{}.jpg'.format(
                    book_id, page_num
                )
                # A timeout keeps a stalled connection from hanging forever.
                res = session.get(url, timeout=30)
                # Fail here rather than saving an error page as "<n>.jpg",
                # which would only blow up later when Pillow opens it.
                res.raise_for_status()
                with open(os.path.join(book, '{}.jpg'.format(page_num)), 'wb') as f:
                    f.write(res.content)

    @staticmethod
    def sort_pic(book: str) -> list:
        """Return the page names found in directory ``book`` in numeric order.

        Only files ending in ``.jpg`` whose stem parses as an integer are
        considered; anything else in the directory is ignored.

        Args:
            book: Directory containing the downloaded page images.

        Returns:
            The file stems (without ``.jpg``) as strings, sorted by their
            integer value, e.g. ``['1', '2', '10']``.
        """
        numbered: list = []
        for file in os.listdir(book):
            stem, ext = os.path.splitext(file)
            if ext != '.jpg':
                continue
            try:
                numbered.append((int(stem), stem))
            except ValueError:
                # Not a page image (e.g. "cover.jpg"); skip instead of crashing.
                continue
        numbered.sort(key=lambda pair: pair[0])
        return [stem for _, stem in numbered]

    @staticmethod
    def generate_pdf(book: str, files: list) -> None:
        """Combine the page images of ``book`` into ``./<book>.pdf``.

        Args:
            book: Directory holding the page images; also the PDF's stem.
            files: Page file stems (without ``.jpg``) in the desired order.
                The list is not modified. If it is empty, no PDF is written.
        """
        if not files:
            return

        pages: list = []
        try:
            for name in files:
                img: Image.Image = Image.open(os.path.join(book, '{}.jpg'.format(name)))
                if img.mode == 'RGBA':
                    # Drop the alpha channel so every page, including the
                    # first one, is written in the same RGB mode.
                    converted = img.convert('RGB')
                    img.close()
                    img = converted
                pages.append(img)

            first, rest = pages[0], pages[1:]
            first.save(
                './{}.pdf'.format(book),
                'PDF',
                resolution=100.0,
                save_all=True,
                quality=100,  # image quality (sharpness)
                subsampling=0,
                append_images=rest,
            )
        finally:
            # Release the file handles held by the opened images.
            for img in pages:
                img.close()

    def run(self) -> None:
        """Download, order and convert every configured book to a PDF."""
        for book in self.textbooks:
            self.download_pic(book)
            files: list = self.sort_pic(book)
            self.generate_pdf(book, files)
|